Set-up: Intel Ivy Bridge Core i7, compiling in 64-bit mode, MSVC(2012) and Win 7 64-bit.
I am trying to understand whether atomic increments cause cache misses.
I set up a test where an atomic variable and another variable were in the same cache line and not in the same cache line and then compared cache misses. Code and results below.
Results
Different cache lines:
- The atomic increment had no L1 cache misses.
- Both increments of d.a suffered 40-50% L1 cache misses.
Same cache line:
- Incrementing d.a had no cache misses.
- Incrementing the atomic encountered 100% L1 cache misses.
Could someone please explain this? I was expecting that when the atomic was in the same cache line as d.a, then d.a would suffer 100% cache misses, and that when they were in different cache lines d.a would not be affected.
#include <atomic>
#include <iostream>
#include <iomanip>
#include <vector>
// Structure to ensure the atomic is NOT in the same cache line as `a`:
// the 20 doubles occupy 160 bytes before `atom`, so with 64-byte
// alignment `a` sits in the first cache line and `atom` in the third.
// NOTE(review): `volatile` is not a synchronization primitive — only
// std::atomic provides atomicity here; `volatile` merely prevents the
// compiler from optimizing away the increments in the benchmark loop.
__declspec(align(64)) struct S{
volatile double a,b,d,c,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t;
volatile std::atomic<short> atom;
};
//Structure to ensure same cache line
/*__declspec(align(64)) struct S{
volatile std::atomic<short> atom;
volatile short a;
};*/
int main(){
volatile S d;
for(long long i=0; i<1000000000; i++){
d.a++;
d.atom++;
d.a++;
}
}
UPDATE: here is the relevant library source and some of the generated asm. The MSVC STL helper is:

/* _Atomic_fetch_add_2, _Atomic_fetch_sub_2 */
inline _Uint2_t _Fetch_add_seq_cst_2(volatile _Uint2_t *_Tgt, _Uint2_t _Value)
{	/* add _Value to *_Tgt atomically with
	   sequentially consistent memory order */
	return (_INTRIN_SEQ_CST(_InterlockedExchangeAdd16)((volatile short *)_Tgt, _Value));
}

and the disassembly for it is:

mov         word ptr [_Tgt],dx
mov         qword ptr [rsp+8],rcx
push        rdi
movzx       eax,word ptr [_Value]
mov         rcx,qword ptr [_Tgt]
lock xadd   word ptr [rcx],ax
pop         rdi