I'm trying to implement an atomic copy of multiple data elements between CPUs. I packed the elements into a single cache line so that they can be manipulated atomically, and wrote the following code.
In this code (compiled with -O3), I aligned a global struct to a cache-line boundary so that it fits in a single cache line. One CPU sets the elements and then executes a store barrier, which is meant to make the stores globally visible to the other CPU.
At the same time, on the other CPU, I used a load barrier to access the cache line atomically. My expectation was that the reader (or consumer) CPU would bring the whole cache line of data into its own cache hierarchy (L1, L2, etc.). So, since I do not use a load barrier again until the next read, all the elements of the data should be the same, but it does not work as expected. I can't keep cache-line atomicity in this code. The writer CPU seems to be putting elements into the cache line piece by piece. How is this possible?
#include <emmintrin.h>
#include <pthread.h>
#include "common.h"
#define CACHE_LINE_SIZE 64

/*
 * Seven 32-bit fields (28 bytes of payload).
 *
 * The aligned attribute raises the type's alignment to CACHE_LINE_SIZE, so
 * sizeof(struct levels) is padded up to CACHE_LINE_SIZE and every object of
 * this type starts on a CACHE_LINE_SIZE boundary.
 * Assumes the target's cache line really is 64 bytes -- TODO confirm.
 */
struct levels {
    uint32_t x1, x2, x3, x4, x5, x6, x7;
} __attribute__((aligned(CACHE_LINE_SIZE)));

/*
 * Written by the loop in main() and read by worker_loop().
 * This is a plain object: neither volatile nor _Atomic.
 */
struct levels g_shared;
void *worker_loop(void *param)
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(15, &cpuset);
pthread_t thread = pthread_self();
int status = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
fatal_relog_if(status != 0, status);
struct levels shared;
while (1) {
_mm_lfence();
shared = g_shared;
if (shared.x1 != shared.x7) {
printf("%u %u %u %u %u %u %u\n",
shared.x1, shared.x2, shared.x3, shared.x4, shared.x5, shared.x6, shared.x7);
exit(EXIT_FAILURE);
}
}
return NULL;
}
/*
 * Writer side of the experiment.
 *
 * Zeroes g_shared, pins the main thread to CPU 16, starts worker_loop() on a
 * second thread, and then loops forever storing the same incrementing value
 * into x1..x7 (in that order), with a store fence after each round.
 *
 * argc and argv are unused. The loop never terminates, so the final return
 * is unreachable.
 */
int main(int argc, char *argv[])
{
    cpu_set_t writer_cpu;
    CPU_ZERO(&writer_cpu);
    CPU_SET(16, &writer_cpu);

    memset(&g_shared, 0, sizeof(g_shared));

    int status = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &writer_cpu);
    fatal_relog_if(status != 0, status);

    pthread_t worker;
    int istatus = pthread_create(&worker, NULL, worker_loop, NULL);
    /*
     * NOTE(review): pthread_create() reports failure through its return value.
     * If fatal_elog_if() logs errno, fatal_relog_if(istatus != 0, istatus)
     * would match the check above -- confirm against common.h.
     */
    fatal_elog_if(istatus != 0);

    for (uint32_t val = 0; ; val++) {
        /* Seven separate 32-bit assignments, x1 first and x7 last. */
        g_shared.x1 = val;
        g_shared.x2 = val;
        g_shared.x3 = val;
        g_shared.x4 = val;
        g_shared.x5 = val;
        g_shared.x6 = val;
        g_shared.x7 = val;
        _mm_sfence();
        /* Also tried at this point (currently disabled): _mm_clflush(&g_shared); */
    }
    return EXIT_SUCCESS;
}
The output looks like this:
3782063 3782063 3782062 3782062 3782062 3782062 3782062
UPDATE 1
I updated the code as shown below to use AVX-512, but the problem persists.
#include <emmintrin.h>
#include <pthread.h>
#include "common.h"
#include <immintrin.h>
#define CACHE_LINE_SIZE 64
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static inline __attribute__((always_inline)) void
mov64(uint8_t *dst, const uint8_t *src)
{
__m512i zmm0;
zmm0 = _mm512_load_si512((const void *)src);
_mm512_store_si512((void *)dst, zmm0);
}
/*
 * Seven 32-bit fields (28 bytes of payload).
 *
 * The aligned attribute raises the type's alignment to CACHE_LINE_SIZE, so
 * sizeof(struct levels) is padded up to CACHE_LINE_SIZE and every object of
 * this type starts on a CACHE_LINE_SIZE boundary.
 */
struct levels {
    uint32_t x1, x2, x3, x4, x5, x6, x7;
} __attribute__((aligned(CACHE_LINE_SIZE)));

/*
 * Written by the loop in main() and read by worker_loop().
 * This is a plain object: neither volatile nor _Atomic.
 */
struct levels g_shared;
void *worker_loop(void *param)
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(15, &cpuset);
pthread_t thread = pthread_self();
int status = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
fatal_relog_if(status != 0, status);
struct levels shared;
while (1) {
mov64((uint8_t *)&shared, (uint8_t *)&g_shared);
// shared = g_shared;
if (shared.x1 != shared.x7) {
printf("%u %u %u %u %u %u %u\n",
shared.x1, shared.x2, shared.x3, shared.x4, shared.x5, shared.x6, shared.x7);
exit(EXIT_FAILURE);
} else {
printf("%u %u\n", shared.x1, shared.x7);
}
}
return NULL;
}
/*
 * Writer side of the experiment.
 *
 * Zeroes g_shared, pins the main thread to CPU 16, starts worker_loop() on a
 * second thread, and then loops forever storing the same incrementing value
 * into x1..x7 (in that order), with a store fence after each round.
 *
 * argc and argv are unused. The loop never terminates, so the final return
 * is unreachable.
 */
int main(int argc, char *argv[])
{
    cpu_set_t writer_cpu;
    CPU_ZERO(&writer_cpu);
    CPU_SET(16, &writer_cpu);

    memset(&g_shared, 0, sizeof(g_shared));

    int status = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &writer_cpu);
    fatal_relog_if(status != 0, status);

    pthread_t worker;
    int istatus = pthread_create(&worker, NULL, worker_loop, NULL);
    /*
     * NOTE(review): pthread_create() reports failure through its return value.
     * If fatal_elog_if() logs errno, fatal_relog_if(istatus != 0, istatus)
     * would match the check above -- confirm against common.h.
     */
    fatal_elog_if(istatus != 0);

    for (uint32_t val = 0; ; val++) {
        /* Seven separate 32-bit assignments, x1 first and x7 last. */
        g_shared.x1 = val;
        g_shared.x2 = val;
        g_shared.x3 = val;
        g_shared.x4 = val;
        g_shared.x5 = val;
        g_shared.x6 = val;
        g_shared.x7 = val;
        _mm_sfence();
        /* Also tried at this point (currently disabled): _mm_clflush(&g_shared); */
    }
    return EXIT_SUCCESS;
}