0

I'm trying to make dot product code with AVX2. However it dosen't work and can't figure out what is wrong with my code. Please help

float vectordot_fma(float *A, float *B, int N)
{
  float c = 0.f;

  __m256 sum = _mm256_set_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);

  for (int i = 0; i < N / 8; ++i)
  {

    __m256 valueA = _mm256_load_ps(A + 8 * i);
    __m256 valueB = _mm256_load_ps(B + 8 * i);
    __m256 mulAB = _mm256_mul_ps(valueA, valueB);

    sum = _mm256_add_ps(sum, mulAB);
  }

  for (int i = N - N % 8; i < N; ++i)
  {
    c += A[i] * B[i];
  }

  __m256 temp = _mm256_hadd_ps(sum, sum);
  __m128 sum_high = _mm256_extractf128_ps(temp, 1);
  __m128 result = _mm_add_ps(sum_high, _mm256_castps256_ps128(temp));

  c = result[0] + c;

  return c;
}
KSMoon
  • 11
  • 3
  • `haddps` only adds two adjacent values together. See this question for how to do efficient horizontal sums: https://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally – chtz Oct 15 '22 at 18:18
  • I changed my code '''c = result[0] + c;''' to '''c = ((float *)&result)[0] + ((float *)&result)[2] + c;''' Am I doing right? It still doesn't work... – KSMoon Oct 17 '22 at 09:21
  • The easiest solution is to call `hadd` twice (`sum = _mm256_hadd_ps(sum, sum); sum = _mm256_hadd_ps(sum, sum);` before extracting the high part and adding it to the low part. See the linked question for more efficient ways to do it. – chtz Oct 17 '22 at 09:45

0 Answers0