Environment: x86-64, CentOS Linux, 8 CPU cores.
To measure the performance impact of false sharing, I wrote the following C++ code:
// Shared globals for the benchmark. Each volatile variable is written by exactly
// one worker in the visible code (a: thread1, b: thread2, c: thread3, d: thread4).
// volatile makes every store an observable access that the compiler may not elide;
// it provides no inter-thread synchronization.
//
// a and b are two 4-byte values declared back to back.
// NOTE(review): the relative placement of separate globals is decided by the
// toolchain, not by the language — confirm the layout from printed addresses.
volatile int32_t a;
volatile int32_t b;
// 56 bytes (7 * 8); never referenced in the visible code — presumably padding
// between the a/b pair and c.
int64_t p1[7];
volatile int64_t c;
// Second 56-byte array; never referenced in the visible code — presumably padding
// between c and d.
int64_t p2[7];
volatile int64_t d;
// Worker 1: performs 1,000,000,000 stores of (counter % 512) into the global `a`
// and then prints the elapsed time of the loop in nanoseconds, prefixed by " 1 cost:".
// `param` is accepted but not used.
void thread1(int param) {
    const auto t_begin = chrono::high_resolution_clock::now();

    size_t n = 0;
    while (n < 1000000000) {
        a = n % 512;  // one store to `a` per iteration
        ++n;
    }

    const auto t_end = chrono::high_resolution_clock::now();
    const auto elapsed_ns =
        chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_begin).count();
    cout << " 1 cost:" << elapsed_ns << endl;
}
// Worker 2: performs 1,000,000,000 stores of (counter % 512) into the global `b`
// and then prints the elapsed time of the loop in nanoseconds, prefixed by " 2 cost:".
// `param` is accepted but not used.
void thread2(int param) {
    const auto t_begin = chrono::high_resolution_clock::now();

    size_t n = 0;
    while (n < 1000000000) {
        b = n % 512;  // one store to `b` per iteration
        ++n;
    }

    const auto t_end = chrono::high_resolution_clock::now();
    const auto elapsed_ns =
        chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_begin).count();
    cout << " 2 cost:" << elapsed_ns << endl;
}
// Worker 3: performs 1,000,000,000 stores of (counter % 512) into the global `c`
// and then prints the elapsed time of the loop in nanoseconds, prefixed by " 3 cost:".
// `param` is accepted but not used.
void thread3(int param) {
    const auto t_begin = chrono::high_resolution_clock::now();

    size_t n = 0;
    while (n < 1000000000) {
        c = n % 512;  // one store to `c` per iteration
        ++n;
    }

    const auto t_end = chrono::high_resolution_clock::now();
    const auto elapsed_ns =
        chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_begin).count();
    cout << " 3 cost:" << elapsed_ns << endl;
}
// Worker 4: performs 1,000,000,000 stores of (counter % 512) into the global `d`
// and then prints the elapsed time of the loop in nanoseconds, prefixed by " 4 cost:".
// `param` is accepted but not used.
void thread4(int param) {
    const auto t_begin = chrono::high_resolution_clock::now();

    size_t n = 0;
    while (n < 1000000000) {
        d = n % 512;  // one store to `d` per iteration
        ++n;
    }

    const auto t_end = chrono::high_resolution_clock::now();
    const auto elapsed_ns =
        chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_begin).count();
    cout << " 4 cost:" << elapsed_ns << endl;
}
Here is my compile command: g++ xxx.cpp --std=c++11 -O0 -lpthread -g
so compiler optimization is disabled (-O0).
I printed the virtual addresses of a, b, c and d; they are:
a addr 0x406200
b addr 0x406204
c addr 0x406258
d addr 0x406298
Here is the execution result (elapsed time in nanoseconds):
4 cost:2186474910
3 cost:6114449628
1 cost:7464439728
2 cost:7469428696
As I understand it, thread3 has no 'cache bouncing' or 'false sharing' problem with any other thread, so why is it slower than thread4?
Addition: if I change the type of a and b from int32_t to int64_t, the addresses and the result change to:
a addr 0x4061e0
b addr 0x4061e8
c addr 0x406238
d addr 0x406278
3 cost:2188341526
4 cost:2193782423
2 cost:6479324727
1 cost:6645607256
which is what I expected.