It seems two functions below can cause segmentation fault when compiled with clang using -mavx (or -march=sandybridge -> skylake).
void _mm256_mul_double_intrin(double* a, double* b, int N)
{
int nb_iters = N / ( sizeof(__m256d) / sizeof(double) );
__m256d* l = (__m256d*)a;
__m256d* r = (__m256d*)b;
for (int i = 0; i < nb_iters; ++i, ++l, ++r)
_mm256_store_pd((double *)l, _mm256_mul_pd(*l, *r));
}
void _mm256_mul_double(double* a, double* b, int N)
{
int nb_iters = N / ( sizeof(__m256d) / sizeof(double) );
__m256d* l = (__m256d*)a;
__m256d* r = (__m256d*)b;
for (int i = 0; i < nb_iters; ++i, ++l, ++r)
__asm__(
"vmulpd %[r], %[l], %[l] \t\n"
: [l] "+x" (*l)
: [r] "m" (*r)
:
);
}
When N is 2 fold or more of 4 (ymm register width / double width), the clang compiled code sometimes cause segmentation fault. (see wandbox link below)
GCC compiled code seems okey.
wandbox.org/permlink/kex4e3lRCKfPAq2J
** I found the original source code here on stackoverflow.com