I would like to write 256 bits of data on one core and read them on another one. There will be only one process writing, but there can be multiple readers.
I was thinking of implementing this using AVX. The reads and writes should be atomic, since each is only one instruction (vmovdqa), and if the data is aligned to a cache line, cache coherency should move it atomically between cores.
I looked at the generated assembly, but I can see 2 writes and 2 reads. Why isn't there just one of each? Would this solution for atomic reads/writes work given these assumptions?
#include <immintrin.h>
#include <cstdint>
// Plain 256-bit payload: four signed 64-bit lanes (4 * 8 = 32 bytes), i.e.
// exactly the width of one 256-bit AVX vector.
struct Data {
    std::int64_t a[4];
};
// Holder for one 256-bit Data value, intended for a single writer and
// multiple readers.
//
// data_ starts its own 64-byte cache line (alignas(64) plus the trailing
// padding), so a 32-byte access to it never straddles two cache lines.
//
// NOTE(review): x86 vendors architecturally guarantee atomicity only for
// small aligned accesses (up to 8 bytes; 16 bytes on AVX-capable parts per
// recent manuals -- confirm). A single 32-byte vmovdqa is not documented to be
// atomic, so concurrent readers may observe a torn value. Verify against the
// Intel/AMD manuals before relying on this for cross-core publication.
struct DataHolder {
    // Copies *in into the holder.
    // `in` is read with an unaligned load because a Data object is only
    // guaranteed 8-byte alignment by its type; the store to data_ is aligned.
    void set_data(Data* in) {
        const __m256i value =
            _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in));
        _mm256_store_si256(reinterpret_cast<__m256i*>(&data_), value);
    }

    // Copies the held value into *out.
    // data_ is read with an aligned load; `out` is written with an unaligned
    // store for the same reason `in` is loaded unaligned in set_data().
    void get_data(Data* out) {
        const __m256i value =
            _mm256_load_si256(reinterpret_cast<const __m256i*>(&data_));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), value);
    }

    alignas(64) Data data_;
    // Fills the rest of the 64-byte cache line after data_.
    char padding[64 - sizeof(Data)];
};
int main() {
Data a, b;
DataHolder ab;
ab.set_data(&a);
ab.get_data(&b);
}
# DataHolder::set_data(Data*) -- compiler-generated listing (Intel operand order).
# In:  rdi = this (DataHolder*), rsi = in (Data*)
# Effect: copies 32 bytes from [in] to [this].
# NOTE(review): the argument spills and reloads look like unoptimized output
# (-O0 style) -- confirm the build flags. The vector takes a round trip through
# a stack temporary, which is why two 32-byte loads and two 32-byte stores
# appear; only one load touches *in and only one store touches *this.
DataHolder::set_data(Data*):
push rbp
mov rbp, rsp
# Round rsp down to a multiple of 32 so the ymm temporary at [rsp-64] is
# 32-byte aligned, as vmovdqa requires.
and rsp, -32
# Spill the arguments to slots below rsp (rsp is never decremented further;
# presumably relying on the SysV red zone, since no call is made here).
mov QWORD PTR [rsp-72], rdi
mov QWORD PTR [rsp-80], rsi
# Load 1: ymm0 = 32 bytes at *in.
mov rax, QWORD PTR [rsp-80]
vmovdqa ymm0, YMMWORD PTR [rax]
# Keep a second copy of `this` (destination pointer) at [rsp-8].
mov rax, QWORD PTR [rsp-72]
mov QWORD PTR [rsp-8], rax
# Store 1: spill the vector to the stack temporary.
vmovdqa YMMWORD PTR [rsp-64], ymm0
# Load 2: reload the vector from the stack temporary.
mov rax, QWORD PTR [rsp-8]
vmovdqa ymm0, YMMWORD PTR [rsp-64]
# Store 2: the only store to the shared object, 32 bytes at [this].
vmovdqa YMMWORD PTR [rax], ymm0
nop
nop
# Restore rsp/rbp (undoes the alignment adjustment) and return.
leave
ret
# DataHolder::get_data(Data*) -- compiler-generated listing (Intel operand order).
# In:  rdi = this (DataHolder*), rsi = out (Data*)
# Effect: copies 32 bytes from [this] to [out].
# NOTE(review): as with set_data, the spill/reload pattern looks like
# unoptimized output -- confirm the build flags. Only one load touches *this;
# the second load/store pair is a round trip through a stack temporary.
DataHolder::get_data(Data*):
push rbp
mov rbp, rsp
# Round rsp down to a multiple of 32 so the ymm temporary at [rsp-64] is
# 32-byte aligned, as vmovdqa requires.
and rsp, -32
# Spill the arguments to slots below rsp (presumably the SysV red zone; no
# call is made in this function).
mov QWORD PTR [rsp-72], rdi
mov QWORD PTR [rsp-80], rsi
# Load 1: the only load from the shared object, 32 bytes at [this].
mov rax, QWORD PTR [rsp-72]
vmovdqa ymm0, YMMWORD PTR [rax]
# Keep a second copy of `out` (destination pointer) at [rsp-8].
mov rax, QWORD PTR [rsp-80]
mov QWORD PTR [rsp-8], rax
# Store 1: spill the vector to the stack temporary.
vmovdqa YMMWORD PTR [rsp-64], ymm0
# Load 2: reload the vector from the stack temporary.
mov rax, QWORD PTR [rsp-8]
vmovdqa ymm0, YMMWORD PTR [rsp-64]
# Store 2: write the 32 bytes to *out.
vmovdqa YMMWORD PTR [rax], ymm0
nop
nop
# Restore rsp/rbp (undoes the alignment adjustment) and return.
leave
ret
# main -- compiler-generated listing (Intel operand order).
# Frame layout after the prologue, as implied by the pointers passed below:
#   [rsp]     ab  (DataHolder)
#   [rsp+64]  b   (Data)
#   [rsp+96]  a   (Data)
# rsp is 64-byte aligned here, so rsp+64 and rsp+96 are both multiples of 32
# and the vmovdqa accesses in the callees are aligned for this particular
# layout.
main:
push rbp
mov rbp, rsp
# Align rsp down to 64 bytes, then reserve 128 bytes for the locals.
and rsp, -64
add rsp, -128
# ab.set_data(&a): rdi = &ab (rsp), rsi = &a (rsp+96).
lea rdx, [rsp+96]
mov rax, rsp
mov rsi, rdx
mov rdi, rax
call DataHolder::set_data(Data*)
# ab.get_data(&b): rdi = &ab (rsp), rsi = &b (rsp+64).
lea rdx, [rsp+64]
mov rax, rsp
mov rsi, rdx
mov rdi, rax
call DataHolder::get_data(Data*)
# Return value 0.
mov eax, 0
leave
ret