I was reading through Agner Fog's instruction tables for x86 and x87 and noticed that there is no instruction for arcsin or arccos, only one for arctan. So I googled it, and all the results implemented them using atan and sqrt, which would mean that acos and asin should be significantly slower than atan because they need an additional sqrt. However, I wrote a simple test program in C++, and acos and asin are both faster than atan:
#include <chrono>
#include <cmath>
#include <iostream>
/// Minimal stopwatch: call start(), run the work, call stop(), then read the
/// elapsed time with duration<T>() / nanoseconds() or print it with printNS().
class timer {
private:
    // steady_clock is monotonic, so (end - begin) cannot be distorted by
    // wall-clock adjustments made while a measurement is in progress.
    // high_resolution_clock offers no such guarantee.
    using clock = std::chrono::steady_clock;

    clock::time_point begin{};  // set by start()
    clock::time_point end{};    // set by stop()

public:
    /// Records the starting instant of the interval.
    void
    start() {
        begin = clock::now();
    }

    /// Records the ending instant of the interval.
    void
    stop() {
        end = clock::now();
    }

    /// Returns the interval between the last start() and the last stop() as a
    /// tick count in the chrono duration type T (truncated by duration_cast).
    template<typename T>
    auto
    duration() const {
        return std::chrono::duration_cast<T>(end - begin).count();
    }

    /// Returns the measured interval in nanoseconds.
    auto
    nanoseconds() const {
        return duration<std::chrono::nanoseconds>();
    }

    /// Writes "<str>: <nanoseconds>" followed by a newline to std::cout.
    void
    printNS(char const* str) const {
        // std::endl is kept on purpose: the flush makes each result visible
        // as soon as it is printed, before the next long-running loop starts.
        std::cout << str << ": " << nanoseconds() << std::endl;
    }
};
/// Benchmark driver: times 200,000,000 calls each of std::acos, std::asin,
/// std::atan and std::atan2, printing the elapsed nanoseconds after each loop
/// and the accumulated sum at the end.
int
main(int argc, char**) {
    timer timer;

    // Number of calls per benchmarked function.
    constexpr int iterations{200000000};
    // Argument step: inputs are i * p1, i.e. they range over [0, 0.2), which
    // lies inside the domain of both acos and asin.
    double p1 = 0 + 0.000000001;
    // Every result is folded into acc1, which is printed at the end, so the
    // computed values stay observable and the loops cannot simply be dropped.
    double acc1{1};

    timer.start();
    // observed: less than 8 seconds
    for (int i{0}; i < iterations; ++i) {
        acc1 += std::acos(i * p1);
    }
    timer.stop();
    timer.printNS("acos");

    timer.start();
    // observed: less than 8 seconds
    for (int i{0}; i < iterations; ++i) {
        acc1 += std::asin(i * p1);
    }
    timer.stop();
    timer.printNS("asin");

    timer.start();
    // observed: more than 12 seconds
    for (int i{0}; i < iterations; ++i) {
        acc1 += std::atan(i * p1);
    }
    timer.stop();
    timer.printNS("atan");

    timer.start();
    // observed: almost 20 seconds
    for (int i{0}; i < iterations; ++i) {
        acc1 += std::atan2(i * p1, i * p1);
    }
    timer.stop();
    // Labelled "atan2" so this line is distinguishable from the atan result.
    timer.printNS("atan2");

    std::cout << acc1 << '\n';
}
I tried looking at the generated assembly on Godbolt, but the compiler doesn't inline acos or asin.
So how are they implemented? Or, if they actually just use atan, how can they be faster?