First I wrote this code:
/*
 * dgemm3 - dense matrix product C = A * B for square n x n matrices.
 *
 * All three matrices are indexed as row-major arrays of n * n doubles
 * (element (r, c) lives at index r * n + c).  C is overwritten, not
 * accumulated into.  Nothing is read or written when n <= 0.
 *
 * The dot product for each output element is unrolled four ways with four
 * independent partial sums.  Any 1..3 trailing terms (n not a multiple of 4)
 * are handled by a scalar remainder loop, so no element outside the n * n
 * matrices is ever touched.
 */
void dgemm3(double* A, double* B, double* C, int n){
    /* Largest multiple of 4 that does not exceed n: bound of the unrolled loop. */
    const int n_unrolled = n - n % 4;
    const double *a_row = A;   /* start of row i of A */
    double *c_out = C;         /* next element of C, filled in row-major order */

    for (int i = 0; i < n; ++i, a_row += n) {
        for (int j = 0; j < n; ++j) {
            const double *b_row = B;   /* start of row k of B */
            double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
            int k = 0;

            /* Four independent accumulators over rows k .. k+3 of B. */
            for (; k < n_unrolled; k += 4, b_row += 4 * n) {
                s0 += a_row[k]     * b_row[j];
                s1 += a_row[k + 1] * b_row[j + n];
                s2 += a_row[k + 2] * b_row[j + 2 * n];
                s3 += a_row[k + 3] * b_row[j + 3 * n];
            }

            /* Same combination order as the unrolled-only version. */
            double sum = s0 + s1 + s2 + s3;

            /* Remainder: the last n % 4 terms of the dot product. */
            for (; k < n; ++k, b_row += n) {
                sum += a_row[k] * b_row[j];
            }

            *c_out++ = sum;
        }
    }
}
and then I wrote this code using AVX:
/*
 * dgemm_avx - dense matrix product C = A * B for square n x n matrices,
 * computing four adjacent columns of C at a time with 256-bit AVX vectors.
 *
 * All three matrices are indexed as row-major arrays of n * n doubles
 * (element (r, c) lives at index r * n + c).  Unaligned loads and stores are
 * used, so no alignment is required.  C is overwritten, not accumulated
 * into.  Nothing is read or written when n <= 0.
 *
 * Columns that do not fill a whole 4-wide vector (n not a multiple of 4) are
 * computed by a scalar loop, so no element outside the n * n matrices is
 * read or written.
 *
 * With GCC/Clang the function is compiled for AVX even when the translation
 * unit is not built with -mavx; the caller must only invoke it on a CPU that
 * supports AVX.
 */
#if defined(__GNUC__) && !defined(__AVX__)
__attribute__((target("avx")))
#endif
void dgemm_avx (double* A, double* B, double* C, int n) {
    /* Largest multiple of 4 that does not exceed n: bound of the vector loop. */
    const int n_vec = n - n % 4;
    const double *a_row = A;   /* start of row i of A */
    double *c_row = C;         /* start of row i of C */

    for (int i = 0; i < n; i++, a_row += n, c_row += n) {
        int j = 0;

        /* Vector part: C[i][j..j+3] = sum over k of A[i][k] * B[k][j..j+3]. */
        for (; j < n_vec; j += 4) {
            __m256d acc = _mm256_setzero_pd();
            const double *b_row = B;   /* start of row k of B */

            for (int k = 0; k < n; k++, b_row += n) {
                __m256d a_ik = _mm256_broadcast_sd(a_row + k);
                __m256d b_kj = _mm256_loadu_pd(b_row + j);
                acc = _mm256_add_pd(acc, _mm256_mul_pd(a_ik, b_kj));
            }
            _mm256_storeu_pd(c_row + j, acc);
        }

        /* Scalar part: the last n % 4 columns of this row. */
        for (; j < n; j++) {
            const double *b_row = B;
            double sum = 0.0;

            for (int k = 0; k < n; k++, b_row += n) {
                sum += a_row[k] * b_row[j];
            }
            c_row[j] = sum;
        }
    }
}
I expected the second one to be faster, but it is not. However, if I compile both of them with the -O1 flag, then the second one is faster. My question is: shouldn't the second one be faster even without optimization?