The question is whether the code below (which sums only V1 and V2, but in general there would be N ~ 10 such vectors, not contiguous in memory, each with a relevant length of 3) is the most efficient way to perform this vectorized accumulation, given that the values to accumulate are not read from contiguous chunks of memory (the target hardware is Intel Skylake).
In particular, should it be faster than accumulating the products with e.g. a horizontal sum, i.e. vectorizing along the 'other' dimension? (A sketch of that variant is included at the end of the post.)
A minimal working example follows:
#include <x86intrin.h>
#include <iostream>
int main(){
// Initialization
// 32-byte aligned so that the aligned coefficient loads below are valid
alignas(32) double coefficients[50][2][12];
for (int i=0; i<4; i++) coefficients[0][0][i] = (double) 1; // C1_0 is 1
for (int i=0; i<4; i++) coefficients[0][0][4+i] = (double) 2; // C2_0 is 2
for (int i=0; i<4; i++) coefficients[0][0][8+i] = (double) 3; // C3_0 is 3
for (int i=0; i<4; i++) coefficients[0][1][i] = (double) 4; // C1_1 is 4
for (int i=0; i<4; i++) coefficients[0][1][4+i] = (double) 5; // C2_1 is 5
for (int i=0; i<4; i++) coefficients[0][1][8+i] = (double) 6; // C3_1 is 6
int ix1 = 13;
int ix2 = 27;
double u[50][3] = {}; // zero-initialized so the extra lane read below holds a defined value
u[ix1][0] = 1.0;
u[ix1][1] = 2.0;
u[ix1][2] = 3.0;
u[ix2][0] = 4.0;
u[ix2][1] = 5.0;
u[ix2][2] = 6.0;
// Step 1: load data
//
// Rows of u hold only 3 doubles (24 bytes), so &u[ix][0] is generally not
// 32-byte aligned: use unaligned loads. Each load also reads one extra
// double from the next row; that lane is a don't-care here.
__m256d V1 = _mm256_loadu_pd(&u[ix1][0]);
__m256d V2 = _mm256_loadu_pd(&u[ix2][0]);
// Populate the vectorised coefficients
__m256d Vcoeff_A_1 = _mm256_load_pd( &coefficients[0][0][0]);
__m256d Vcoeff_B_1 = _mm256_load_pd( &coefficients[0][0][4]);
__m256d Vcoeff_C_1 = _mm256_load_pd( &coefficients[0][0][8]);
__m256d Vcoeff_A_2 = _mm256_load_pd( &coefficients[0][1][0]);
__m256d Vcoeff_B_2 = _mm256_load_pd( &coefficients[0][1][4]);
__m256d Vcoeff_C_2 = _mm256_load_pd( &coefficients[0][1][8]);
// The accumulation refers to
//
// resultA[i] = V1[i] * Vcoeff_A_1[i] + V2[i] * Vcoeff_A_2[i] + ... (with further vectors V3, V4, V5, ... in the general case)
//
__m256d resultA = _mm256_fmadd_pd( V1, Vcoeff_A_1, _mm256_mul_pd(V2, Vcoeff_A_2));
__m256d resultB = _mm256_fmadd_pd( V1, Vcoeff_B_1, _mm256_mul_pd(V2, Vcoeff_B_2));
__m256d resultC = _mm256_fmadd_pd( V1, Vcoeff_C_1, _mm256_mul_pd(V2, Vcoeff_C_2));
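// For verification only (added here as a sketch, not part of the original
// question): store the results and print the three relevant lanes.
alignas(32) double outA[4], outB[4], outC[4];
_mm256_store_pd(outA, resultA);
_mm256_store_pd(outB, resultB);
_mm256_store_pd(outC, resultC);
for (int i=0; i<3; i++)
    std::cout << outA[i] << " " << outB[i] << " " << outC[i] << "\n";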
}
The following shows how an accumulation over 12 vectors would look (it does not compile as-is, since the Vu_* and Vcoeff_*_B256 variables are not declared):
__m256d resultB = _mm256_fmadd_pd( Vu_0, Vcoeff_0_B256, // 0
_mm256_fmadd_pd( Vu_1, Vcoeff_1_B256, // 1
_mm256_fmadd_pd( Vu_2, Vcoeff_2_B256, // 2
_mm256_fmadd_pd( Vu_3, Vcoeff_3_B256, // 3
_mm256_fmadd_pd( Vu_4, Vcoeff_4_B256, // 4
_mm256_fmadd_pd( Vu_5, Vcoeff_5_B256, // 5
_mm256_fmadd_pd( Vu_6, Vcoeff_6_B256, // 6
_mm256_fmadd_pd( Vu_7, Vcoeff_7_B256, // 7
_mm256_fmadd_pd( Vu_8, Vcoeff_8_B256, // 8
_mm256_fmadd_pd( Vu_9, Vcoeff_9_B256, // 9
_mm256_fmadd_pd( Vu_10, Vcoeff_10_B256, // 10
_mm256_mul_pd( Vu_11, Vcoeff_11_B256 ) // 11
)))))))))));
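For comparison, here is a rough sketch of the alternative mentioned at the top, i.e. vectorizing along the summation dimension and reducing each output component with a horizontal sum. It is only an illustration under assumed conditions: the indices ix3 and ix4, the packed coefficient vector coeffA, and the helper names are invented for this sketch and are not part of the code above.

#include <x86intrin.h>

// Horizontal sum of the four doubles in a __m256d.
static inline double hsum256_pd(__m256d v) {
    __m128d lo = _mm256_castpd256_pd128(v);         // lanes 0,1
    __m128d hi = _mm256_extractf128_pd(v, 1);       // lanes 2,3
    lo = _mm_add_pd(lo, hi);                        // {v0+v2, v1+v3}
    __m128d swapped = _mm_unpackhi_pd(lo, lo);      // {v1+v3, v1+v3}
    return _mm_cvtsd_f64(_mm_add_sd(lo, swapped));  // v0+v1+v2+v3
}

// One component i of resultA, accumulating over four source vectors at
// (hypothetical) indices ix1..ix4. The i-th elements are gathered from
// non-contiguous rows, multiplied by per-vector coefficients, and reduced.
// coeffA holds the coefficient of each source vector in the matching lane
// (lane 0 for ix1, ..., lane 3 for ix4).
static double resultA_component(const double u[50][3], int i,
                                int ix1, int ix2, int ix3, int ix4,
                                __m256d coeffA) {
    __m256d vals = _mm256_set_pd(u[ix4][i], u[ix3][i], u[ix2][i], u[ix1][i]);
    return hsum256_pd(_mm256_mul_pd(vals, coeffA));
}

The gather into vals (done here with _mm256_set_pd, though it could also be an AVX2 gather) and the per-component horizontal reduction are the extra costs of this layout compared with the lane-parallel FMA chains above.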