I compiled this program in GCC 4.8 -O2
on Skylake 6700HQ.
When I use float
data type, total execution time is 0.000176
sec. When I change the float
to int
, the total time is 0.000026
(~7x faster). I don't know the reason for this difference. Related question: assembly output O3
I use this command in Geany IDE build command gcc -Wall -march=native -O2 -o "%e" "%f"
. I also tried -O3
and -Ofast
, but those do not fix the problem.
I also read this question but there is too much differences between this float
and int
implementation. Since this float
implementation is 7 times slower than the corresponding int
implementation, this is not a duplicate question
#include <stdio.h>
#include <time.h>
float a[32][32]
, t[32][32]
, c_result[32][32]
, c_tra[32][32] ;
int main()
{
int w = 10000;
int i, j, k, temp;
struct timespec tStart, tEnd;
double tTotal , tBest=10000;
do{
clock_gettime(CLOCK_MONOTONIC,&tStart);
for( i = 0; i < 32; i++){
for( j =0 ; j < 32; j++){
temp=0;
for( k = 0 ;k < 32; k++) {
temp += a[i][k] * c_tra[j][k];
}
c_result[i][j]= temp;
}
}
clock_gettime(CLOCK_MONOTONIC,&tEnd);
tTotal = (tEnd.tv_sec - tStart.tv_sec);
tTotal += (tEnd.tv_nsec - tStart.tv_nsec) / 1000000000.0;
if(tTotal<tBest)
tBest=tTotal;
}while(w--);
printf(" The best time: %lf sec\n",tBest);
return 0;
}
It is assembly out put for int
data type:
.file "floatMULm.c"
.section .rodata.str1.8,"aMS",@progbits,1
.align 8
.LC2:
.string " The best time: %lf sec in %d repetition for %dX%d matrix\n"
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB24:
.cfi_startproc
subq $40, %rsp
.cfi_def_cfa_offset 48
movl $1, %edi
movq %rsp, %rsi
call clock_gettime
xorl %esi, %esi
.L2:
xorl %r8d, %r8d
.p2align 4,,10
.p2align 3
.L7:
movq %r8, %rdi
xorl %eax, %eax
xorl %ecx, %ecx
salq $5, %rdi
.p2align 4,,10
.p2align 3
.L5:
movl a(%rsi,%rax), %edx
imull c_tra(%rdi,%rax), %edx
addq $4, %rax
addl %edx, %ecx
cmpq $128, %rax
jne .L5
movl %ecx, c_result(%rsi,%r8)
addq $4, %r8
cmpq $128, %r8
jne .L7
subq $-128, %rsi
cmpq $4096, %rsi
jne .L2
leaq 16(%rsp), %rsi
movl $1, %edi
call clock_gettime
movq 24(%rsp), %rax
subq 8(%rsp), %rax
movl $32, %r8d
movl $32, %ecx
movl $10000, %edx
movl $.LC2, %esi
movl $1, %edi
vcvtsi2sdq %rax, %xmm1, %xmm1
movq 16(%rsp), %rax
subq (%rsp), %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
movl $1, %eax
vdivsd .LC1(%rip), %xmm1, %xmm1
vaddsd %xmm0, %xmm1, %xmm0
vminsd .LC0(%rip), %xmm0, %xmm0
call __printf_chk
xorl %eax, %eax
addq $40, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE24:
.size main, .-main
.comm c_tra,4096,32
.comm c_result,4096,32
.comm t,4096,32
.comm a,4096,32
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LC0:
.long 0
.long 1086556160
.align 8
.LC1:
.long 0
.long 1104006501
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4"
.section .note.GNU-stack,"",@progbits
And this is for float
:
.file "floatMULm.c"
.section .rodata.str1.8,"aMS",@progbits,1
.align 8
.LC2:
.string " The best time: %lf sec in %d repetition for %dX%d matrix\n"
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB24:
.cfi_startproc
subq $40, %rsp
.cfi_def_cfa_offset 48
movl $1, %edi
movq %rsp, %rsi
call clock_gettime
xorl %ecx, %ecx
.L2:
xorl %edi, %edi
.p2align 4,,10
.p2align 3
.L7:
movq %rdi, %rsi
xorl %eax, %eax
xorl %edx, %edx
salq $5, %rsi
.p2align 4,,10
.p2align 3
.L5:
vcvtsi2ss %edx, %xmm0, %xmm0
vmovss a(%rcx,%rax), %xmm2
vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0
addq $4, %rax
vcvttss2si %xmm0, %edx
cmpq $128, %rax
jne .L5
vcvtsi2ss %edx, %xmm0, %xmm0
vmovss %xmm0, c_result(%rcx,%rdi)
addq $4, %rdi
cmpq $128, %rdi
jne .L7
subq $-128, %rcx
cmpq $4096, %rcx
jne .L2
leaq 16(%rsp), %rsi
movl $1, %edi
call clock_gettime
movq 24(%rsp), %rax
subq 8(%rsp), %rax
movl $32, %r8d
movl $32, %ecx
movl $10000, %edx
movl $.LC2, %esi
movl $1, %edi
vcvtsi2sdq %rax, %xmm1, %xmm1
movq 16(%rsp), %rax
subq (%rsp), %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
movl $1, %eax
vdivsd .LC1(%rip), %xmm1, %xmm1
vaddsd %xmm0, %xmm1, %xmm0
vminsd .LC0(%rip), %xmm0, %xmm0
call __printf_chk
xorl %eax, %eax
addq $40, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE24:
.size main, .-main
.comm c_tra,4096,32
.comm c_result,4096,32
.comm t,4096,32
.comm a,4096,32
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LC0:
.long 0
.long 1086556160
.align 8
.LC1:
.long 0
.long 1104006501
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4"
.section .note.GNU-stack,"",@progbits