I'm a newcomer to assembly language. I've written two strcpy implementations in MASM: one uses rsi and rdi (with movsb), and the other does not. The second one takes less time. As I understand it, using rsi and rdi is the recommended way to copy data, and the second version has a bigger loop body than the first. Yet when I measured the performance, the first version took more time. Why is the first version slower, and what is the recommended way (recommended instructions or registers) to handle strings in x86-64 assembly?
strcpy using rsi and rdi:
; char *custom_strcpy(char *dst, const char *src)
;
; Copies the NUL-terminated string at src to dst, terminator included, using
; movsb, and returns dst.
;   rcx = dst, rdx = src (first two integer arguments), rax = return value.
;
; rsi and rdi are non-volatile in the Windows x64 calling convention, so they
; are saved in a proper prolog (with unwind data) and restored before return.
custom_strcpy proc frame
        push    rsi                     ; preserve caller's rsi
        .pushreg rsi
        push    rdi                     ; preserve caller's rdi
        .pushreg rdi
        .endprolog
        mov     rsi, rdx                ; movsb reads from [rsi]
        mov     rdi, rcx                ; movsb writes to [rdi]
        mov     rax, rcx                ; return value: the original dst
copy_next:
        movsb                           ; [rdi] <- [rsi], then rsi++ / rdi++
                                        ; (assumes DF is clear, as the calling convention requires)
        cmp     byte ptr [rsi - 1], 0   ; test the single byte that was just copied,
                                        ; so the terminator itself is copied and an
                                        ; empty string stops after one byte
        jne     copy_next
        pop     rdi                     ; restore in reverse order of the pushes
        pop     rsi
        ret
custom_strcpy endp
strcpy not using rsi and rdi:
; char *custom_strcpy(char *dst, const char *src)
;
; Byte-by-byte string copy that touches only volatile registers.
;   rcx = dst, rdx = src on entry; rax = dst on return.
; The terminating NUL is stored before the loop exits.
custom_strcpy proc
        mov     rax, rcx                ; remember dst for the return value
next_char:
        movzx   r8d, byte ptr [rdx]     ; fetch one source byte
        mov     byte ptr [rcx], r8b     ; store it at the destination
        lea     rcx, [rcx + 1]          ; advance dst
        lea     rdx, [rdx + 1]          ; advance src
        test    r8b, r8b                ; was that byte the terminator?
        jnz     next_char               ; no: keep copying
        ret
custom_strcpy endp
C++ code I used to measure the performance:
#include <iostream>
#include <chrono>
#include <cstring>
// Number of copies performed by each timed loop in foo().
#define TIMES 100000000
using namespace std;
using namespace std::chrono;
// String-copy routine under test; only declared here, with C linkage so the
// symbol name is not C++-mangled.
extern "C" char * custom_strcpy(char * dst, const char * src);
/// Times TIMES calls of the library strcpy and then TIMES calls of
/// custom_strcpy on the same short string, printing the elapsed seconds of
/// each loop on its own line to std::cout (strcpy first).
extern "C" void foo()
{
    // steady_clock is monotonic, which interval timing needs;
    // high_resolution_clock may be an alias of system_clock and can jump
    // when the wall clock is adjusted.
    using bench_clock = std::chrono::steady_clock;

    char src[] = "Hello, world!";
    char dst[sizeof(src)];

    // Prints the seconds elapsed since `begin` as a floating-point value.
    const auto report_since = [](bench_clock::time_point begin) {
        const std::chrono::duration<double> elapsed = bench_clock::now() - begin;
        // std::endl flushes on purpose: the first result should be visible
        // while the second loop is still running.
        std::cout << elapsed.count() << std::endl;
    };

    auto start = bench_clock::now();
    for (int i = 0; i < TIMES; ++i)
    {
        strcpy(dst, src);
    }
    report_since(start);

    start = bench_clock::now();
    for (int i = 0; i < TIMES; ++i)
    {
        custom_strcpy(dst, src);
    }
    report_since(start);
}