I have 700 items and I loop through the 700 items for each I obtain the item' three attributes and perform some basic calculations. I have implemented this using two techniques:
1) Three 700-element arrays, one array for each of the three attributes. So:
item0.a = array1[0]
item0.b = array2[0]
item0.e = array3[0]
2) One 2100-element array containing data for the three attributes consecutively. So:
item0.a = array[(0*3)+0]
item0.b = array[(0*3)+1]
item0.e = array[(0*3)+2]
Now the three item attributes a
, b
and e
are used together within the loop- therefore it would make sense that if you store them in one array the performance should be better than if you use the three-array technique (due to spatial locality). However:
- Three 700-element arrays = 3300 CPU cycles on average for the whole loop
- One 2100-element array = 3500 CPU cycles on average for the whole loop
Here is the code for the 2100-array technique:
unsigned int x;
unsigned int y;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array[2100];
//I have left out code for simplicity. You can assume by now the array is populated.
start = __rdtscp(&x);
for(int i=0; i < 700; i++){
unsigned short j = i * 3;
unsigned int a = array[j + 0];
unsigned int b = array[j + 1];
data_for_all_items = data_for_all_items & (a!= -1 & b != -1);
unsigned int e = array[j + 2];
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
and here is the code for the three 700-element arrays technique:
unsigned int x;
unsigned int y;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array1[700];
unsigned int array2[700];
unsigned int array3[700];
//I have left out code for simplicity. You can assume by now the arrays are populated.
start = __rdtscp(&x);
for(int i=0; i < 700; i++){
unsigned int a= array1[i]; //Array 1
unsigned int b= array2[i]; //Array 2
data_for_all_items = data_for_all_items & (a!= -1 & b != -1);
unsigned int e = array3[i]; //Array 3
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
Why isn't the technique using one-2100 element array faster? It should be because the three attributes are used together, per each 700 item.
I used MSVC 2012, Win 7 64
Assembly for 3x 700-element array technique:
start = __rdtscp(&x);
rdtscp
shl rdx,20h
lea r8,[this]
or rax,rdx
mov dword ptr [r8],ecx
mov r8d,8ch
mov r9,rax
lea rdx,[rbx+0Ch]
for(int i=0; i < 700; i++){
sub rdi,rbx
unsigned int a = array1[i];
unsigned int b = array2[i];
data_for_all_items = data_for_all_items & (a != -1 & b != -1);
cmp dword ptr [rdi+rdx-0Ch],0FFFFFFFFh
lea rdx,[rdx+14h]
setne cl
cmp dword ptr [rdi+rdx-1Ch],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdi+rdx-18h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdi+rdx-10h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdi+rdx-14h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-20h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-1Ch],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-18h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-10h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-14h],0FFFFFFFFh
setne al
and cl,al
and r15b,cl
dec r8
jne 013F26DA53h
unsigned int e = array3[i];
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r8,[y]
or rax,rdx
mov dword ptr [r8],ecx
Assembler for the 2100-element array technique:
start = __rdtscp(&x);
rdtscp
lea r8,[this]
shl rdx,20h
or rax,rdx
mov dword ptr [r8],ecx
for(int i=0; i < 700; i++){
xor r8d,r8d
mov r10,rax
unsigned short j = i*3;
movzx ecx,r8w
add cx,cx
lea edx,[rcx+r8]
unsigned int a = array[j + 0];
unsigned int b = array[j + 1];
data_for_all_items = data_for_all_items & (best_ask != -1 & best_bid != -1);
movzx ecx,dx
cmp dword ptr [r9+rcx*4+4],0FFFFFFFFh
setne dl
cmp dword ptr [r9+rcx*4],0FFFFFFFFh
setne al
inc r8d
and dl,al
and r14b,dl
cmp r8d,2BCh
jl 013F05DA10h
unsigned int e = array[pos + 2];
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r8,[y]
or rax,rdx
mov dword ptr [r8],ecx