On most desktop and server architectures branching is faster than indirect calls, since they do branch prediction and/or speculative execution. I have never heard of an architecture where indirect call is faster than a single branch. (Jump tables, for switch()
statements, have more than one branch, and are therefore a different thing altogether.)
Consider the following microbenchmark I threw together. test.c
:
/* test.c */
volatile long test_calls = 0L;
volatile long test_sum = 0L;
void test(long counter)
{
test_calls++;
test_sum += counter;
}
work.c
:
/* work.c */
void test(long counter);
/* Work function, to be measured */
void test_work(long counter, int flag)
{
if (flag)
test(counter);
}
/* Dummy function, to measure call overhead */
void test_none(long counter __attribute__((unused)), int flag __attribute__((unused)) )
{
return;
}
and harness.c
:
#define _POSIX_C_SOURCE 200809L
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
/* From test.c */
extern volatile long test_calls;
extern volatile long test_sum;
/* Dummy function, to measure call overhead */
void test_none(long counter, int flag);
/* Work function, to be measured */
void test_work(long counter, int flag);
/* Timing harness -- GCC x86; modify for other architectures */
struct timing {
struct timespec wall_start;
struct timespec wall_stop;
uint64_t cpu_start;
uint64_t cpu_stop;
};
static inline void start_timing(struct timing *const mark)
{
clock_gettime(CLOCK_REALTIME, &(mark->wall_start));
mark->cpu_start = __builtin_ia32_rdtsc();
}
static inline void stop_timing(struct timing *const mark)
{
mark->cpu_stop = __builtin_ia32_rdtsc();
clock_gettime(CLOCK_REALTIME, &(mark->wall_stop));
}
static inline double cpu_timing(const struct timing *const mark)
{
return (double)(mark->cpu_stop - mark->cpu_start); /* Cycles */
}
static inline double wall_timing(const struct timing *const mark)
{
return (double)(mark->wall_stop.tv_sec - mark->wall_start.tv_sec)
+ (double)(mark->wall_stop.tv_nsec - mark->wall_start.tv_nsec) / 1000000000.0;
}
static int cmpdouble(const void *aptr, const void *bptr)
{
const double a = *(const double *)aptr;
const double b = *(const double *)bptr;
if (a < b)
return -1;
else
if (a > b)
return +1;
else
return 0;
}
void report(double *const wall, double *const cpu, const size_t count)
{
printf("\tInitial call: %.0f cpu cycles, %.9f seconds real time\n", cpu[0], wall[0]);
qsort(wall, count, sizeof (double), cmpdouble);
qsort(cpu, count, sizeof (double), cmpdouble);
printf("\tMinimum: %.0f cpu cycles, %.9f seconds real time\n", cpu[0], wall[0]);
printf("\t5%% less than %.0f cpu cycles, %.9f seconds real time\n", cpu[count/20], wall[count/20]);
printf("\t25%% less than %.0f cpu cycles, %.9f seconds real time\n", cpu[count/4], wall[count/4]);
printf("\tMedian: %.0f cpu cycles, %.9f seconds real time\n", cpu[count/2], wall[count/2]);
printf("\t75%% less than %.0f cpu cycles, %.9f seconds real time\n", cpu[count-count/4-1], wall[count-count/4-1]);
printf("\t95%% less than %.0f cpu cycles, %.9f seconds real time\n", cpu[count-count/20-1], wall[count-count/20-1]);
printf("\tMaximum: %.0f cpu cycles, %.9f seconds real time\n", cpu[count-1], wall[count-1]);
}
int main(int argc, char *argv[])
{
struct timing measurement;
double *wall_seconds = NULL;
double *cpu_cycles = NULL;
unsigned long count = 0UL;
unsigned long i;
int flag;
char dummy;
if (argc != 3 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s COUNT FLAG\n", argv[0]);
fprintf(stderr, "\n");
return 1;
}
if (sscanf(argv[1], " %lu %c", &count, &dummy) != 1) {
fprintf(stderr, "%s: Invalid COUNT.\n", argv[1]);
return 1;
}
if (count < 1UL) {
fprintf(stderr, "%s: COUNT is too small.\n", argv[1]);
return 1;
}
if (!(unsigned long)(count + 1UL)) {
fprintf(stderr, "%s: COUNT is too large.\n", argv[1]);
return 1;
}
if (sscanf(argv[2], " %d %c", &flag, &dummy) != 1) {
fprintf(stderr, "%s: Invalid FLAG.\n", argv[2]);
return 1;
}
wall_seconds = malloc(sizeof (double) * (size_t)count);
cpu_cycles = malloc(sizeof (double) * (size_t)count);
if (!wall_seconds || !cpu_cycles) {
free(cpu_cycles);
free(wall_seconds);
fprintf(stderr, "Cannot allocate enough memory. Try smaller COUNT.\n");
return 1;
}
printf("Call and measurement overhead:\n");
fflush(stdout);
for (i = 0UL; i < count; i++) {
start_timing(&measurement);
test_none(i, flag);
stop_timing(&measurement);
wall_seconds[i] = wall_timing(&measurement);
cpu_cycles[i] = cpu_timing(&measurement);
}
report(wall_seconds, cpu_cycles, (size_t)count);
printf("\n");
printf("Measuring FLAG==0 calls: ");
fflush(stdout);
test_calls = 0L;
test_sum = 0L;
for (i = 0UL; i < count; i++) {
start_timing(&measurement);
test_work(i, 0);
stop_timing(&measurement);
wall_seconds[i] = wall_timing(&measurement);
cpu_cycles[i] = cpu_timing(&measurement);
}
printf("%ld calls, sum %ld.\n", test_calls, test_sum);
report(wall_seconds, cpu_cycles, (size_t)count);
printf("\n");
printf("Measuring FLAG==%d calls:", flag);
fflush(stdout);
test_calls = 0L;
test_sum = 0L;
for (i = 0UL; i < count; i++) {
start_timing(&measurement);
test_work(i, flag);
stop_timing(&measurement);
wall_seconds[i] = wall_timing(&measurement);
cpu_cycles[i] = cpu_timing(&measurement);
}
printf("%ld calls, sum %ld.\n", test_calls, test_sum);
report(wall_seconds, cpu_cycles, (size_t)count);
printf("\n");
printf("Measuring alternating FLAG calls: ");
fflush(stdout);
test_calls = 0L;
test_sum = 0L;
for (i = 0UL; i < count; i++) {
start_timing(&measurement);
test_work(i, i & 1);
stop_timing(&measurement);
wall_seconds[i] = wall_timing(&measurement);
cpu_cycles[i] = cpu_timing(&measurement);
}
printf("%ld calls, sum %ld.\n", test_calls, test_sum);
report(wall_seconds, cpu_cycles, (size_t)count);
printf("\n");
free(cpu_cycles);
free(wall_seconds);
return 0;
}
Put the three files in an empty directory, then compile and build ./call-test
:
rm -f *.o
gcc -W -Wall -O3 -fomit-frame-pointer -c harness.c
gcc -W -Wall -O3 -fomit-frame-pointer -c work.c
gcc -W -Wall -O3 -fomit-frame-pointer -c test.c
gcc harness.o work.o test.o -lrt -o call-test
On AMD Athlon II X4 640, using gcc-4.6.3 (Xubuntu 10.04), running
./call-test 1000000 1
tells me that the overhead is just 2 clock cycles (< 1ns) for the test alone (branch not taken), and just 4 clock cycles (just over a nanosecond) when calling the second function which increases test_calls
and adds the counter to test_sum
.
When omitting all optimizations (use -O0
and leave out -fomit-frame-pointer
when compiling), the test alone costs about 3 clock cycles (3 cycles if branch not taken), and about 9 cycles if the branch is taken and the work is done to update the two extra variables.
(The two extra variables let you easily see that the harness does actually do all it should do; they're just an extra check. And I wanted to have some work in the second function, so the timing differences would be easier to spot.)
The above interpretation is only valid for the case when the code is already cached; i.e. run recently. If the code is run only rarely, it won't be in cache. However, then the test overhead matters even less. Caching effects -- for example, if "nearby" code has been run (you can see this for the call overhead measurement, the other test functions code' tends to get cached too!) -- are much larger anyway. (While the test harness does produce the initial call results separately, don't put too much faith in it, since it does not try to clear any caches in any way.)
My conclusion is that adding
if (flag)
debug_function_call();
to any normal code is perfectly fine: the overhead is literally neglible; practically irrelevant. As always, consider the overall algorithm instead. Any enhancements in the algorithm yield much bigger rewards than worrying about the code the compiler generates.
(Since I wrote the test code above at one sitting, there are likely some bugs and/or brainfarts in them. Check, and if you find any, let me know below so I can fix the code.)