I am working with the matrix-matrix multiplication and I noticed that if I use 2D arrays, A[M][N], insted of 1D arrays to store a matrix, the access to that takes less time against the locality principle. Here there is the code where the time using linearised matrix is greater than the second one. Why ?
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
void matrixMul(double* m1, double* m2, double* m3, int N) {
int i, j, k;
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
for (k = 0; k < N; k++) {
m3[i * N + j] += m1[i*N+k] * m2[k*N+j];
}
}
}
}
void matrixlin(double** m1, double** m2, double** m3, int N) {
int i, j, k;
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
for (k = 0; k < N; k++) {
m3[i][j] += m1[i][k] * m2[k][j];
}
}
}
}
int main(int argc, char* argv[]) {
int N = 1024;
int i, j;
double *m1 = (double *)malloc(N*N*sizeof(double));
double *m2 = (double *)malloc(N*N*sizeof(double));
double *m3 = (double *)malloc(N*N*sizeof(double));
memset(m3, 0, N * N * sizeof(double));
double **mm1 = (double **)malloc(N*sizeof(double*));
double **mm2 = (double **)malloc(N*sizeof(double*));
double **mm3 = (double **)malloc(N*sizeof(double*));
for(i=0; i<N; i++){
mm1[i]=(double *)malloc(N*sizeof(double));
mm2[i]=(double *)malloc(N*sizeof(double));
mm3[i]=(double *)malloc(N*sizeof(double));
memset(mm3[i], 0, N * sizeof(double));
}
for(i=0; i<N; i++){
for(j=0; j<N; j++){
m1[i * N + j] = 1.1; m2[i * N + j]=2.1;
mm1[i][j] = 1.1; mm2[i][j] = 2.1;
}
}
clock_t t1 = clock();
matrixMul(m1, m2, m3, N);
t1 = clock() - t1;
printf("Elapsed time linearized: %.5f seconds\n", ((double)t1)/CLOCKS_PER_SEC);
clock_t t2 = clock();
matrixlin(mm1, mm2, mm3, N);
t2 = clock() - t2;
printf("Elapsed time 2D array: %.5f seconds\n", ((double)t2)/CLOCKS_PER_SEC);
free(m1);
free(m2);
free(m3);
for(i=0; i<N; i++){
free(mm1[i]);
free(mm2[i]);
free(mm3[i]);
}
free(mm1);
free(mm2);
free(mm3);
return 0;
}
Output:
$ gcc test.c
$ ./a.out
Elapsed time linearized: 22.40697 seconds
Elapsed time 2D array: 7.61103 seconds
Assembly code (gcc -S):
.file "test.c"
.text
.globl matrixMul
.type matrixMul, @function
matrixMul:
.LFB6:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -12(%rbp)
jmp .L2
.L7:
movl $0, -8(%rbp)
jmp .L3
.L6:
movl $0, -4(%rbp)
jmp .L4
.L5:
movl -12(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -8(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movsd (%rax), %xmm1
movl -12(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -4(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -24(%rbp), %rax
addq %rdx, %rax
movsd (%rax), %xmm2
movl -4(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -8(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -32(%rbp), %rax
addq %rdx, %rax
movsd (%rax), %xmm0
mulsd %xmm2, %xmm0
movl -12(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -8(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
addsd %xmm1, %xmm0
movsd %xmm0, (%rax)
addl $1, -4(%rbp)
.L4:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L5
addl $1, -8(%rbp)
.L3:
movl -8(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L6
addl $1, -12(%rbp)
.L2:
movl -12(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L7
nop
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE6:
.size matrixMul, .-matrixMul
.globl matrixlin
.type matrixlin, @function
matrixlin:
.LFB7:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -12(%rbp)
jmp .L9
.L14:
movl $0, -8(%rbp)
jmp .L10
.L13:
movl $0, -4(%rbp)
jmp .L11
.L12:
movl -12(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -8(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd (%rax), %xmm1
movl -12(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -24(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -4(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd (%rax), %xmm2
movl -4(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -32(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -8(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd (%rax), %xmm0
mulsd %xmm2, %xmm0
movl -12(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -8(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
addsd %xmm1, %xmm0
movsd %xmm0, (%rax)
addl $1, -4(%rbp)
.L11:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L12
addl $1, -8(%rbp)
.L10:
movl -8(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L13
addl $1, -12(%rbp)
.L9:
movl -12(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L14
nop
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE7:
.size matrixlin, .-matrixlin
.section .rodata
.align 8
.LC3:
.string "Elapsed time linearized: %.5f seconds\n"
.align 8
.LC4:
.string "Elapsed time 2D array: %.5f seconds\n"
.text
.globl main
.type main, @function
main:
.LFB8:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $104, %rsp
.cfi_offset 3, -24
movl %edi, -100(%rbp)
movq %rsi, -112(%rbp)
movl $1024, -84(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc@PLT
movq %rax, -80(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc@PLT
movq %rax, -72(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc@PLT
movq %rax, -64(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -64(%rbp), %rax
movl $0, %esi
movq %rax, %rdi
call memset@PLT
movl -84(%rbp), %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc@PLT
movq %rax, -56(%rbp)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc@PLT
movq %rax, -48(%rbp)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc@PLT
movq %rax, -40(%rbp)
movl $0, -92(%rbp)
jmp .L16
.L17:
movl -84(%rbp), %eax
cltq
salq $3, %rax
movl -92(%rbp), %edx
movslq %edx, %rdx
leaq 0(,%rdx,8), %rcx
movq -56(%rbp), %rdx
leaq (%rcx,%rdx), %rbx
movq %rax, %rdi
call malloc@PLT
movq %rax, (%rbx)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movl -92(%rbp), %edx
movslq %edx, %rdx
leaq 0(,%rdx,8), %rcx
movq -48(%rbp), %rdx
leaq (%rcx,%rdx), %rbx
movq %rax, %rdi
call malloc@PLT
movq %rax, (%rbx)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movl -92(%rbp), %edx
movslq %edx, %rdx
leaq 0(,%rdx,8), %rcx
movq -40(%rbp), %rdx
leaq (%rcx,%rdx), %rbx
movq %rax, %rdi
call malloc@PLT
movq %rax, (%rbx)
movl -84(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rcx
movq -40(%rbp), %rax
addq %rcx, %rax
movq (%rax), %rax
movl $0, %esi
movq %rax, %rdi
call memset@PLT
addl $1, -92(%rbp)
.L16:
movl -92(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L17
movl $0, -92(%rbp)
jmp .L18
.L21:
movl $0, -88(%rbp)
jmp .L19
.L20:
movl -92(%rbp), %eax
imull -84(%rbp), %eax
movl %eax, %edx
movl -88(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -80(%rbp), %rax
addq %rdx, %rax
movsd .LC0(%rip), %xmm0
movsd %xmm0, (%rax)
movl -92(%rbp), %eax
imull -84(%rbp), %eax
movl %eax, %edx
movl -88(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -72(%rbp), %rax
addq %rdx, %rax
movsd .LC1(%rip), %xmm0
movsd %xmm0, (%rax)
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -56(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -88(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd .LC0(%rip), %xmm0
movsd %xmm0, (%rax)
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -48(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -88(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd .LC1(%rip), %xmm0
movsd %xmm0, (%rax)
addl $1, -88(%rbp)
.L19:
movl -88(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L20
addl $1, -92(%rbp)
.L18:
movl -92(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L21
call clock@PLT
movq %rax, -32(%rbp)
movl -84(%rbp), %ecx
movq -64(%rbp), %rdx
movq -72(%rbp), %rsi
movq -80(%rbp), %rax
movq %rax, %rdi
call matrixMul
call clock@PLT
subq -32(%rbp), %rax
movq %rax, -32(%rbp)
pxor %xmm0, %xmm0
cvtsi2sdq -32(%rbp), %xmm0
movsd .LC2(%rip), %xmm1
divsd %xmm1, %xmm0
movq %xmm0, %rax
movq %rax, %xmm0
leaq .LC3(%rip), %rdi
movl $1, %eax
call printf@PLT
call clock@PLT
movq %rax, -24(%rbp)
movl -84(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call matrixlin
call clock@PLT
subq -24(%rbp), %rax
movq %rax, -24(%rbp)
pxor %xmm0, %xmm0
cvtsi2sdq -24(%rbp), %xmm0
movsd .LC2(%rip), %xmm1
divsd %xmm1, %xmm0
movq %xmm0, %rax
movq %rax, %xmm0
leaq .LC4(%rip), %rdi
movl $1, %eax
call printf@PLT
movq -80(%rbp), %rax
movq %rax, %rdi
call free@PLT
movq -72(%rbp), %rax
movq %rax, %rdi
call free@PLT
movq -64(%rbp), %rax
movq %rax, %rdi
call free@PLT
movl $0, -92(%rbp)
jmp .L22
.L23:
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -56(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rax
movq %rax, %rdi
call free@PLT
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -48(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rax
movq %rax, %rdi
call free@PLT
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rax
movq %rax, %rdi
call free@PLT
addl $1, -92(%rbp)
.L22:
movl -92(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L23
movq -56(%rbp), %rax
movq %rax, %rdi
call free@PLT
movq -48(%rbp), %rax
movq %rax, %rdi
call free@PLT
movq -40(%rbp), %rax
movq %rax, %rdi
call free@PLT
movl $0, %eax
movq -8(%rbp), %rbx
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE8:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long -1717986918
.long 1072798105
.align 8
.LC1:
.long -858993459
.long 1073794252
.align 8
.LC2:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 10.3.0-1ubuntu1) 10.3.0"
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a"
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
0:
.string "GNU"
1:
.align 8
.long 0xc0000002
.long 3f - 2f
2:
.long 0x3
3:
.align 8
4: