I am trying to vectorise the inner loop the following nested loop. Firstly, is this good practice, or should one avoid attempting to vectorise nested loops?
The following works, it has already some basic loop unrolling.
int sparsemv(struct mesh *A, const double * const x, double * const y) {
const int nrow = (const int) A->local_nrow;
int j = 0;
double sum = 0.0;
#pragma omp parallel for private(j, sum)
for (int i=0; i< nrow; i++) {
sum = 0.0;
const double * const cur_vals = (const double * const) A->ptr_to_vals_in_row[i];
const int * const cur_inds = (const int * const) A->ptr_to_inds_in_row[i];
const int cur_nnz = (const int) A->nnz_in_row[i];
int unroll = (cur_nnz/4)*4;
for (j=0; j< unroll; j+=4) {
sum += cur_vals[j] * x[cur_inds[j]];
sum += cur_vals[j+1] * x[cur_inds[j+1]];
sum += cur_vals[j+2] * x[cur_inds[j+2]];
sum += cur_vals[j+3] * x[cur_inds[j+3]];
}
for (; j < cur_nnz; j++) {
sum += cur_vals[j] * x[cur_inds[j]];
}
y[i] = sum;
}
return 0;
}
However, when I try to vectorise using 256-bit Vector registers in AVX2, I get either the incorrect answers or seg faults. x and y are aligned but A is not, but for the moment, all loading and storing is done using unaligned operations since that is the only time I don't get seg faults:
int sparsemv(struct mesh *A, const double * const x, double * const y) {
const int nrow = (const int) A->local_nrow;
int j = 0;
double sum = 0.0;
#pragma omp parallel for private(j, sum)
for (int i=0; i< nrow; i++) {
sum = 0.0;
const double * const cur_vals = (const double * const) A->ptr_to_vals_in_row[i];
const int * const cur_inds = (const int * const) A->ptr_to_inds_in_row[i];
const int cur_nnz = (const int) A->nnz_in_row[i];
int unroll = (cur_nnz/4)*4;
__m256d sumVec = _mm256_set1_pd(sum);
for (j=0; j< unroll; j+=4) {
__m256d cur_valsVec = _mm256_loadu_pd(cur_vals + j);
__m256d xVec = _mm256_loadu_pd(x + cur_inds[j]);
sumVec = _mm256_add_pd(sumVec, _mm256_mul_pd(cur_valsVec, xVec));
}
_mm256_storeu_pd(y + i, sumVec); // Is this storing in y + i + 1, 2 and 3 aswell?
for (; j < cur_nnz; j++) {
sum += cur_vals[j] * x[cur_inds[j]];
}
y[i] += sum;
}
return 0;
}