2

I am working with the matrix-matrix multiplication and I noticed that if I use 2D arrays, A[M][N], insted of 1D arrays to store a matrix, the access to that takes less time against the locality principle. Here there is the code where the time using linearised matrix is greater than the second one. Why ?

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>

void matrixMul(double* m1, double* m2, double* m3, int N) {
    int i, j, k;
    
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) { 
                m3[i * N + j] += m1[i*N+k] * m2[k*N+j];
            }
        }
    }
}

void matrixlin(double** m1, double** m2, double** m3, int N) {
    int i, j, k;
    
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) { 
                m3[i][j] += m1[i][k] * m2[k][j];
            }
        }
    }
}

int main(int argc, char* argv[]) {
    int N = 1024;
    int i, j;
    
    double *m1 = (double *)malloc(N*N*sizeof(double));
    double *m2 = (double *)malloc(N*N*sizeof(double));
    double *m3 = (double *)malloc(N*N*sizeof(double));
    memset(m3, 0, N * N * sizeof(double)); 
    

    double **mm1 = (double **)malloc(N*sizeof(double*));
    double **mm2 = (double **)malloc(N*sizeof(double*));
    double **mm3 = (double **)malloc(N*sizeof(double*));
    for(i=0; i<N; i++){
        mm1[i]=(double *)malloc(N*sizeof(double));
        mm2[i]=(double *)malloc(N*sizeof(double));
        mm3[i]=(double *)malloc(N*sizeof(double));
        memset(mm3[i], 0,  N * sizeof(double)); 
    }
    
    for(i=0; i<N; i++){
        for(j=0; j<N; j++){
            m1[i * N + j] = 1.1; m2[i * N + j]=2.1;
            mm1[i][j] = 1.1; mm2[i][j] = 2.1;
        }
    }
    

    clock_t t1 = clock();
    matrixMul(m1, m2, m3, N);
    t1 = clock() - t1;
    printf("Elapsed time linearized: %.5f seconds\n", ((double)t1)/CLOCKS_PER_SEC);

    
    clock_t t2 = clock();
    matrixlin(mm1, mm2, mm3, N);
    t2 = clock() - t2;
    printf("Elapsed time 2D array: %.5f seconds\n", ((double)t2)/CLOCKS_PER_SEC);
    
    
    free(m1);
    free(m2);
    free(m3);
    for(i=0; i<N; i++){
        free(mm1[i]);
        free(mm2[i]);
        free(mm3[i]);
    }
    free(mm1);
    free(mm2);
    free(mm3);
    return 0;
}

Output:

$ gcc test.c
$ ./a.out
Elapsed time linearized: 22.40697 seconds
Elapsed time 2D array: 7.61103 seconds

Assembly code (gcc -S):

    .file   "test.c"
    .text
    .globl  matrixMul
    .type   matrixMul, @function
matrixMul:
.LFB6:
    .cfi_startproc
    endbr64
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    movq    %rdi, -24(%rbp)
    movq    %rsi, -32(%rbp)
    movq    %rdx, -40(%rbp)
    movl    %ecx, -44(%rbp)
    movl    $0, -12(%rbp)
    jmp .L2
.L7:
    movl    $0, -8(%rbp)
    jmp .L3
.L6:
    movl    $0, -4(%rbp)
    jmp .L4
.L5:
    movl    -12(%rbp), %eax
    imull   -44(%rbp), %eax
    movl    %eax, %edx
    movl    -8(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -40(%rbp), %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm1
    movl    -12(%rbp), %eax
    imull   -44(%rbp), %eax
    movl    %eax, %edx
    movl    -4(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -24(%rbp), %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm2
    movl    -4(%rbp), %eax
    imull   -44(%rbp), %eax
    movl    %eax, %edx
    movl    -8(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -32(%rbp), %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm0
    mulsd   %xmm2, %xmm0
    movl    -12(%rbp), %eax
    imull   -44(%rbp), %eax
    movl    %eax, %edx
    movl    -8(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -40(%rbp), %rax
    addq    %rdx, %rax
    addsd   %xmm1, %xmm0
    movsd   %xmm0, (%rax)
    addl    $1, -4(%rbp)
.L4:
    movl    -4(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L5
    addl    $1, -8(%rbp)
.L3:
    movl    -8(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L6
    addl    $1, -12(%rbp)
.L2:
    movl    -12(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L7
    nop
    nop
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE6:
    .size   matrixMul, .-matrixMul
    .globl  matrixlin
    .type   matrixlin, @function
matrixlin:
.LFB7:
    .cfi_startproc
    endbr64
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    movq    %rdi, -24(%rbp)
    movq    %rsi, -32(%rbp)
    movq    %rdx, -40(%rbp)
    movl    %ecx, -44(%rbp)
    movl    $0, -12(%rbp)
    jmp .L9
.L14:
    movl    $0, -8(%rbp)
    jmp .L10
.L13:
    movl    $0, -4(%rbp)
    jmp .L11
.L12:
    movl    -12(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -40(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -8(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm1
    movl    -12(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -24(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -4(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm2
    movl    -4(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -32(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -8(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm0
    mulsd   %xmm2, %xmm0
    movl    -12(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -40(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -8(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    addsd   %xmm1, %xmm0
    movsd   %xmm0, (%rax)
    addl    $1, -4(%rbp)
.L11:
    movl    -4(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L12
    addl    $1, -8(%rbp)
.L10:
    movl    -8(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L13
    addl    $1, -12(%rbp)
.L9:
    movl    -12(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L14
    nop
    nop
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE7:
    .size   matrixlin, .-matrixlin
    .section    .rodata
    .align 8
.LC3:
    .string "Elapsed time linearized: %.5f seconds\n"
    .align 8
.LC4:
    .string "Elapsed time 2D array: %.5f seconds\n"
    .text
    .globl  main
    .type   main, @function
main:
.LFB8:
    .cfi_startproc
    endbr64
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    pushq   %rbx
    subq    $104, %rsp
    .cfi_offset 3, -24
    movl    %edi, -100(%rbp)
    movq    %rsi, -112(%rbp)
    movl    $1024, -84(%rbp)
    movl    -84(%rbp), %eax
    imull   %eax, %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -80(%rbp)
    movl    -84(%rbp), %eax
    imull   %eax, %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -72(%rbp)
    movl    -84(%rbp), %eax
    imull   %eax, %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -64(%rbp)
    movl    -84(%rbp), %eax
    imull   %eax, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -64(%rbp), %rax
    movl    $0, %esi
    movq    %rax, %rdi
    call    memset@PLT
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -56(%rbp)
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -48(%rbp)
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -40(%rbp)
    movl    $0, -92(%rbp)
    jmp .L16
.L17:
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movl    -92(%rbp), %edx
    movslq  %edx, %rdx
    leaq    0(,%rdx,8), %rcx
    movq    -56(%rbp), %rdx
    leaq    (%rcx,%rdx), %rbx
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, (%rbx)
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movl    -92(%rbp), %edx
    movslq  %edx, %rdx
    leaq    0(,%rdx,8), %rcx
    movq    -48(%rbp), %rdx
    leaq    (%rcx,%rdx), %rbx
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, (%rbx)
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movl    -92(%rbp), %edx
    movslq  %edx, %rdx
    leaq    0(,%rdx,8), %rcx
    movq    -40(%rbp), %rdx
    leaq    (%rcx,%rdx), %rbx
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, (%rbx)
    movl    -84(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rcx
    movq    -40(%rbp), %rax
    addq    %rcx, %rax
    movq    (%rax), %rax
    movl    $0, %esi
    movq    %rax, %rdi
    call    memset@PLT
    addl    $1, -92(%rbp)
.L16:
    movl    -92(%rbp), %eax
    cmpl    -84(%rbp), %eax
    jl  .L17
    movl    $0, -92(%rbp)
    jmp .L18
.L21:
    movl    $0, -88(%rbp)
    jmp .L19
.L20:
    movl    -92(%rbp), %eax
    imull   -84(%rbp), %eax
    movl    %eax, %edx
    movl    -88(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -80(%rbp), %rax
    addq    %rdx, %rax
    movsd   .LC0(%rip), %xmm0
    movsd   %xmm0, (%rax)
    movl    -92(%rbp), %eax
    imull   -84(%rbp), %eax
    movl    %eax, %edx
    movl    -88(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -72(%rbp), %rax
    addq    %rdx, %rax
    movsd   .LC1(%rip), %xmm0
    movsd   %xmm0, (%rax)
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -56(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -88(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    movsd   .LC0(%rip), %xmm0
    movsd   %xmm0, (%rax)
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -48(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -88(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    movsd   .LC1(%rip), %xmm0
    movsd   %xmm0, (%rax)
    addl    $1, -88(%rbp)
.L19:
    movl    -88(%rbp), %eax
    cmpl    -84(%rbp), %eax
    jl  .L20
    addl    $1, -92(%rbp)
.L18:
    movl    -92(%rbp), %eax
    cmpl    -84(%rbp), %eax
    jl  .L21
    call    clock@PLT
    movq    %rax, -32(%rbp)
    movl    -84(%rbp), %ecx
    movq    -64(%rbp), %rdx
    movq    -72(%rbp), %rsi
    movq    -80(%rbp), %rax
    movq    %rax, %rdi
    call    matrixMul
    call    clock@PLT
    subq    -32(%rbp), %rax
    movq    %rax, -32(%rbp)
    pxor    %xmm0, %xmm0
    cvtsi2sdq   -32(%rbp), %xmm0
    movsd   .LC2(%rip), %xmm1
    divsd   %xmm1, %xmm0
    movq    %xmm0, %rax
    movq    %rax, %xmm0
    leaq    .LC3(%rip), %rdi
    movl    $1, %eax
    call    printf@PLT
    call    clock@PLT
    movq    %rax, -24(%rbp)
    movl    -84(%rbp), %ecx
    movq    -40(%rbp), %rdx
    movq    -48(%rbp), %rsi
    movq    -56(%rbp), %rax
    movq    %rax, %rdi
    call    matrixlin
    call    clock@PLT
    subq    -24(%rbp), %rax
    movq    %rax, -24(%rbp)
    pxor    %xmm0, %xmm0
    cvtsi2sdq   -24(%rbp), %xmm0
    movsd   .LC2(%rip), %xmm1
    divsd   %xmm1, %xmm0
    movq    %xmm0, %rax
    movq    %rax, %xmm0
    leaq    .LC4(%rip), %rdi
    movl    $1, %eax
    call    printf@PLT
    movq    -80(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movq    -72(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movq    -64(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movl    $0, -92(%rbp)
    jmp .L22
.L23:
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -56(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rax
    movq    %rax, %rdi
    call    free@PLT
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -48(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rax
    movq    %rax, %rdi
    call    free@PLT
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -40(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rax
    movq    %rax, %rdi
    call    free@PLT
    addl    $1, -92(%rbp)
.L22:
    movl    -92(%rbp), %eax
    cmpl    -84(%rbp), %eax
    jl  .L23
    movq    -56(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movq    -48(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movq    -40(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movl    $0, %eax
    movq    -8(%rbp), %rbx
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE8:
    .size   main, .-main
    .section    .rodata
    .align 8
.LC0:
    .long   -1717986918
    .long   1072798105
    .align 8
.LC1:
    .long   -858993459
    .long   1073794252
    .align 8
.LC2:
    .long   0
    .long   1093567616
    .ident  "GCC: (Ubuntu 10.3.0-1ubuntu1) 10.3.0"
    .section    .note.GNU-stack,"",@progbits
    .section    .note.gnu.property,"a"
    .align 8
    .long    1f - 0f
    .long    4f - 1f
    .long    5
0:
    .string  "GNU"
1:
    .align 8
    .long    0xc0000002
    .long    3f - 2f
2:
    .long    0x3
3:
    .align 8
4:
Dresult
  • 171
  • 1
  • 11
  • I compiled and ran your code with gcc 5.5.0 and twice my results were almost identical, with 2D access being slightly slower than linearized. You should post what OS and what version of gcc you are using. Finally, it may be worth while to run gcc -S to look at the assembly that is produced, and see what the difference is. – Lev M. Jun 26 '21 at 14:34
  • 1
    In the first one, the compiler might have less ability to optimize the array indexing. It might not realise that you are simulating a 2D array. In the 2D example, there is a clear opportunity for the compiler to make temporary pointers between the loops, so that the inner loop is a 1D array access. – Weather Vane Jun 26 '21 at 14:35
  • Do you see the same time differences when compiling with optimization level 2 or even 3? – Sergio Monteleone Jun 26 '21 at 14:38
  • @LevM. gcc version 10.3.0 (Ubuntu 10.3.0-1ubuntu1) - Ubuntu 21.04. I've added the assembly code above. – Dresult Jun 26 '21 at 14:44
  • With such timing tests I like to alternate the tests several times, perhaps in a loop, rather than simply do one before the other. – Weather Vane Jun 26 '21 at 14:46
  • @SergioMonteleone The times are always different even if they are lower than before. – Dresult Jun 26 '21 at 14:46
  • *Why accessing 2d array takes less than 1d array?* Your code doesn't contain any 2d arrays. It contains 1d arrays of `double` (`m1`, etc), and 1d arrays of pointers that point to separate 1d arrays (`mm1`, etc). Anytime you see nested `malloc()` calls you are not using n-dimensional arrays - you are using 1d arrays of pointers to multiple and separate 1d arrays. See [**Correctly allocating multi-dimensional arrays**](https://stackoverflow.com/questions/42094465/correctly-allocating-multi-dimensional-arrays) – Andrew Henle Jun 26 '21 at 23:25
  • @AndrewHenle Oh, that's right. Thanks. However I don't understand why there is such a differench between these two approaces... – Dresult Jun 27 '21 at 08:51

0 Answers0