I have just begun playing around with my vectorising code. My matrix-vector multiplication code is not being autovectorised by gcc
, I’d like to know why. This pastebin contains the output from -fopt-info-vec-missed
.
I’m having trouble understanding what the output is telling me and seeing how it matches up to what I’ve written in code.
For instance, I see a number of lines saying not enough data-refs in basic block
, I can’t find much detail online with a google search about this. I also see that there’s issues relating to memory alignment e.g. Unknown misalignment, naturally aligned
and vector alignment may not be reachable
. All of my memory allocation was for double
types using malloc
, which I believed was guaranteed to be aligned for that type.
Environment: compiling with gcc
on WSL2
gcc -v: gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#define N 4000 // Matrix size will be N x N
#define T 1
//gcc -fopenmp -g vectorisation.c -o main -O3 -march=native -fopt-info-vec-missed=missed.txt
void doParallelComputation(double *A, double *V, double *results, unsigned long matrixSize, int numThreads)
{
omp_set_num_threads(numThreads);
unsigned long i, j;
#pragma omp parallel for simd private(j)
for (i = 0; i < matrixSize; i++)
{
// double *AHead = &A[i * matrixSize];
// double tmp = 0;
for (j = 0; j < matrixSize; j++)
{
results[i] += A[i * matrixSize + j] * V[j];
// also tried tmp += A[i * matrixSize + j] * V[j];
}
// results[i] = tmp;
}
}
void genRandVector(double *S, unsigned long size)
{
srand(time(0));
unsigned long i;
for (i = 0; i < size; i++)
{
double n = rand() % 5;
S[i] = n;
}
}
void genRandMatrix(double *A, unsigned long size)
{
srand(time(0));
unsigned long i, j;
for (i = 0; i < size; i++)
{
for (j = 0; j < size; j++)
{
double n = rand() % 5;
A[i*size + j] = n;
}
}
}
int main(int argc, char *argv[])
{
double *V = (double *)malloc(N * sizeof(double)); // v in our A*v = parV computation
double *parV = (double *)malloc(N * sizeof(double)); // Parallel computed vector
double *A = (double *)malloc(N * N * sizeof(double)); // NxN Matrix to multiply by V
genRandVector(V, N);
doParallelComputation(A, V, parV, N, T);
free(parV);
free(A);
free(V);
return 0;
}