
I am trying to vectorise the inner loop of the following nested loop. Firstly, is this good practice, or should one avoid attempting to vectorise nested loops?

The following works; it already has some basic loop unrolling.

int sparsemv(struct mesh *A, const double * const x, double * const y) {

  const int nrow = (const int) A->local_nrow;
  int j = 0;
  double sum = 0.0;
  #pragma omp parallel for private(j, sum)
  for (int i=0; i< nrow; i++) {
    sum = 0.0;
    const double * const cur_vals = (const double * const) A->ptr_to_vals_in_row[i];
    const int * const cur_inds = (const int * const) A->ptr_to_inds_in_row[i];
    const int cur_nnz = (const int) A->nnz_in_row[i];
    int unroll = (cur_nnz/4)*4;
    for (j=0; j< unroll; j+=4) {
      sum += cur_vals[j] * x[cur_inds[j]];
      sum += cur_vals[j+1] * x[cur_inds[j+1]];
      sum += cur_vals[j+2] * x[cur_inds[j+2]];
      sum += cur_vals[j+3] * x[cur_inds[j+3]];
    }
    for (; j < cur_nnz; j++) {
      sum += cur_vals[j] * x[cur_inds[j]];
    }
    y[i] = sum;
  }
  return 0;
}

However, when I try to vectorise using 256-bit vector registers with AVX2, I get either incorrect answers or segfaults. x and y are aligned but A is not, so for the moment all loading and storing is done with unaligned operations, since that is the only way I avoid segfaults:

int sparsemv(struct mesh *A, const double * const x, double * const y) {

  const int nrow = (const int) A->local_nrow;
  int j = 0;
  double sum = 0.0;
  #pragma omp parallel for private(j, sum)
  for (int i=0; i< nrow; i++) {
    sum = 0.0;
    const double * const cur_vals = (const double * const) A->ptr_to_vals_in_row[i];
    const int * const cur_inds = (const int * const) A->ptr_to_inds_in_row[i];
    const int cur_nnz = (const int) A->nnz_in_row[i];
    int unroll = (cur_nnz/4)*4;
    __m256d sumVec = _mm256_set1_pd(sum);
    for (j=0; j< unroll; j+=4) {
      __m256d cur_valsVec = _mm256_loadu_pd(cur_vals + j);
      __m256d xVec = _mm256_loadu_pd(x + cur_inds[j]);
      sumVec = _mm256_add_pd(sumVec, _mm256_mul_pd(cur_valsVec, xVec));
    }
    _mm256_storeu_pd(y + i, sumVec);  // Is this storing in y + i + 1, 2 and 3 as well?
    for (; j < cur_nnz; j++) {
      sum += cur_vals[j] * x[cur_inds[j]];
    }
    y[i] += sum;
  }
  return 0;
}

  • You need to use a gathered load for x[cur_inds[j]] – Paul R Mar 07 '22 at 10:44
  • @PaulR what would that look like? I am not familiar with gathered loads – nzy Mar 07 '22 at 12:28
  • Search for `gather` in Intel's intrinsics finder like Paul hinted you should: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html&text=gather&techs=SSE,SSE2,SSE3,SSSE3,SSE4_1,SSE4_2,AVX,AVX2 – Peter Cordes Mar 07 '22 at 13:14
  • This should work (previous comment was mixing `float` with `double`): `__m256d xVec = _mm256_i32gather_pd(x, _mm_loadu_si128((__m128i)(cur_inds + j)), 8);` -- still untested, though. – chtz Mar 07 '22 at 13:38
  • @chtz I get the following error: ```error: can’t convert value to a vector __m256d xVec = _mm256_i32gather_pd(x, _mm_loadu_si128((__m128i)(cur_inds + j)), 8);``` – nzy Mar 07 '22 at 21:51
  • Try `(__m128i const*)` instead of `(__m128i)` (as I said, I did not test this, mostly because you did not provide a [mre] I could start with ...) – chtz Mar 08 '22 at 10:34
  • @chtz Thank you, it compiles, however still produces the wrong answer :/ I can't really give a mre as it would be pretty big. But thank you for the help anyway. I'll just keep trying. – nzy Mar 08 '22 at 20:26
  • Instead of storing `sumVec` to `y+i` you need to do a horizontal reduction (https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction). – chtz Mar 09 '22 at 03:30
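
Putting the comments' two suggestions together (a gathered load for x[cur_inds[j]] and a horizontal reduction of sumVec instead of the 256-bit store to y + i), the per-row part of the loop might look like the sketch below. It is untested; it assumes AVX2 with <immintrin.h> included and reuses the variables (cur_vals, cur_inds, cur_nnz, unroll, sum, j) from the code above:

    // Untested sketch: gather the four x values addressed by cur_inds[j..j+3],
    // then reduce sumVec to a scalar before the remainder loop.
    __m256d sumVec = _mm256_setzero_pd();
    for (j = 0; j < unroll; j += 4) {
      __m256d cur_valsVec = _mm256_loadu_pd(cur_vals + j);
      __m128i idxVec = _mm_loadu_si128((__m128i const *)(cur_inds + j)); // 4 x int32 indices
      __m256d xVec = _mm256_i32gather_pd(x, idxVec, 8);                  // scale 8 = sizeof(double)
      sumVec = _mm256_add_pd(sumVec, _mm256_mul_pd(cur_valsVec, xVec));
    }
    // Horizontal reduction: add the two 128-bit halves, then the two remaining lanes.
    __m128d lo = _mm256_castpd256_pd128(sumVec);
    __m128d hi = _mm256_extractf128_pd(sumVec, 1);
    __m128d pair = _mm_add_pd(lo, hi);
    sum = _mm_cvtsd_f64(_mm_add_sd(pair, _mm_unpackhi_pd(pair, pair)));
    for (; j < cur_nnz; j++) {
      sum += cur_vals[j] * x[cur_inds[j]];
    }
    y[i] = sum;  // plain scalar store; nothing is written past y[i]

Note that _mm256_storeu_pd(y + i, sumVec) in the original writes four doubles starting at y[i], so it does overwrite y[i+1], y[i+2] and y[i+3], which is one source of the wrong answers.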
