I'm trying to make dot product code with AVX2. However it dosen't work and can't figure out what is wrong with my code. Please help
float vectordot_fma(float *A, float *B, int N)
{
float c = 0.f;
__m256 sum = _mm256_set_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
for (int i = 0; i < N / 8; ++i)
{
__m256 valueA = _mm256_load_ps(A + 8 * i);
__m256 valueB = _mm256_load_ps(B + 8 * i);
__m256 mulAB = _mm256_mul_ps(valueA, valueB);
sum = _mm256_add_ps(sum, mulAB);
}
for (int i = N - N % 8; i < N; ++i)
{
c += A[i] * B[i];
}
__m256 temp = _mm256_hadd_ps(sum, sum);
__m128 sum_high = _mm256_extractf128_ps(temp, 1);
__m128 result = _mm_add_ps(sum_high, _mm256_castps256_ps128(temp));
c = result[0] + c;
return c;
}