1

This is actually a program provided by my lecturer for comparing the efficiencies of avx method vs traditional method on matrix calculation. I was able to run this program well in a Macbook air 2015 13'. However, I wasn't able to run this program in manjaro linux as it will pop up this error:

~/Documents >>> gcc -O3 -mavx dgemm_avx.c -o dgemm_avx
~/Documents >>> ./dgemm_avx           
Enter the matrix width: 1024
zsh: segmentation fault (core dumped)  ./dgemm_avx

Below is the code:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <immintrin.h>

void dgemm_avx(int n, double *A, double *B, double *C)
{
    int i, j, k;
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j += 4)
        {
            __m256d c0 = _mm256_load_pd(C + i * n + j);
            for (k = 0; k < n; k++)
            {
                __m256d vA = _mm256_broadcast_sd(A + k + i * n);
                __m256d vB = _mm256_load_pd(B + j + k * n);
                __m256d vC = _mm256_mul_pd(vA, vB);
                c0 = _mm256_add_pd(c0, vC);
            }

            _mm256_store_pd(C + j + i * n, c0);
        }
}

void initialize(int n, double *X)
{
    int i;
    for (i = 0; i < n * n; i++)
        X[i] = (double)rand() / (double)RAND_MAX;
}

int main()
{
    int n;
    printf("Enter the matrix width: ");
    scanf("%d", &n);

    double *A = (double *)malloc(n * n * sizeof(double));
    double *B = (double *)malloc(n * n * sizeof(double));
    double *C = (double *)malloc(n * n * sizeof(double));

    srand(1);
    initialize(n, A);
    initialize(n, B);
    memset(C, 0, n * n * sizeof(double));

    struct timeval st, et;
    gettimeofday(&st, NULL);

    dgemm_avx(n, A, B, C);

    gettimeofday(&et, NULL);

    float elapsed = ((float)(et.tv_sec - st.tv_sec)) + (float)(et.tv_usec - st.tv_usec) * 0.000001f;

    printf("Calculation time for %d x %d matrix: %0.6f seconds\n", n, n, elapsed);
    return 0;
}

Thanks in advance.

Chris
  • 11
  • 1
  • malloc doesn't give you 32-byte aligned memory. If you compile without optimization, you'll get an actual `vmovaps` that requires alignment (because you used `_mm256_load_pd`), instead of a load folded into a memory source for `vmulps`. Also a vmovaps for the `c0` load. I assume on Mac you just happened to get 32-byte aligned memory. – Peter Cordes Nov 24 '20 at 06:53

0 Answers0