In copyij
you have
src[i][j] =+ 1;
That parses as src[i][j] = (+1), a plain assignment of 1 rather than an increment, so I think you meant
src[i][j] += 1;
Measuring Time
For keeping track of elapsed time, I recommend clock_gettime
with a monotonic clock CLOCK_MONOTONIC
, or CLOCK_MONOTONIC_COARSE
if your system has it (it is Linux-specific: faster to read, but less precise than CLOCK_MONOTONIC).
To find the time delta, you can use the following macro (from OpenBSD's sys/time.h
, see the manual timespecsub(3)
):
/*
 * timespecsub(tsp, usp, vsp): store the difference *tsp - *usp in *vsp.
 *
 * Each argument is a pointer to a structure with tv_sec and tv_nsec members
 * (e.g. struct timespec).  Seconds and nanoseconds are subtracted separately;
 * if the nanosecond difference is negative, one second is borrowed and
 * 1000000000 ns are added back.
 *
 * The arguments are evaluated more than once, so do not pass expressions
 * with side effects.
 */
#define timespecsub(tsp, usp, vsp) \
do \
{ \
(vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \
(vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \
if ((vsp)->tv_nsec < 0) \
{ \
(vsp)->tv_sec--; \
(vsp)->tv_nsec += 1000000000L; \
} \
} while (0)
For example:
// start and end must be filled in first (e.g. with clock_gettime) before
// they are subtracted; this fragment only shows the subtraction itself.
struct timespec start, end, delta;
timespecsub(&end, &start, &delta);
// delta contains the time difference from start to end
If you just want to benchmark the copy functions, then dividing the elapsed time by NNN
doesn't make sense unless you want the average time per row (for ij) or per column (for ji).
To print struct timespec
you can use %lld
for tv_sec (cast it to long long first, because the underlying type of time_t is implementation-defined)
and %.9ld
for tv_nsec
.
Compiler Optimizations
To ensure the compiler doesn't optimize away your functions:
- Use
volatile
for the matrices. This prevents the compiler from optimizing away operations on them.
// Source and destination matrices; volatile so that accesses to them are not
// optimized away.
static volatile int src[NNN][NNN], dst[NNN][NNN];
- For GCC, use a compiler directive such as
#pragma GCC optimize("O0")
around the copy functions to prevent GCC optimizing the loops. See here for alternatives.
/*
 * Save the current GCC optimization options, switch to -O0 for the copy
 * functions only, and restore the saved options afterwards.
 */
#pragma GCC push_options
#pragma GCC optimize("O0")
/* Row-by-row variant; body elided. */
static void
copyij(void)
{
    // ...
}

/* Column-by-column variant; body elided. */
static void
copyji(void)
{
    // ...
}
#pragma GCC pop_options
Without these precautions, in my testing GCC (v11.2.1 20210728) seems to optimize the functions away at -O3
, but not -O2
or below.
Results
Updated code (gist)
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Dimension of the square src and dst matrices. */
#define NNN 2048
/* Number of timed runs performed for each copy function. */
#define BENCH_COUNT 10
/*
 * Subtract two timestamps: *vsp = *tsp - *usp.
 *
 * The arguments are pointers to structures with tv_sec / tv_nsec members
 * (struct timespec in this program).  A negative nanosecond remainder is
 * fixed up by borrowing one second.  Every argument is expanded several
 * times, so pass only side-effect-free expressions.
 */
#define timespecsub(tsp, usp, vsp)                                \
    do {                                                          \
        (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec;            \
        (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec;         \
        if ((vsp)->tv_nsec < 0) {                                 \
            (vsp)->tv_sec--;                                      \
            (vsp)->tv_nsec += 1000000000L;                        \
        }                                                         \
    } while (0)
/*
 * Forward declarations.  The parameter lists are spelled (void) so these are
 * real prototypes that match the void (*)(void) pointer benchmark() accepts;
 * empty parentheses would leave the parameters unspecified.
 */
static void benchmark(void (*f)(void), const char *name);
static void copyij(void);
static void copyji(void);
static void init_mat(void);

/*
 * The two NNN x NNN matrices the benchmark operates on.  They are volatile so
 * that accesses to them are not optimized away.
 */
static volatile int src[NNN][NNN], dst[NNN][NNN];
/*
 * Entry point: prints a heading and times copyij BENCH_COUNT times, then
 * does the same for copyji.
 */
int main(void)
{
    size_t run;

    printf("ij:\n");
    run = 0;
    while (run < BENCH_COUNT) {
        benchmark(copyij, "copyij");
        run++;
    }

    printf("ji:\n");
    run = 0;
    while (run < BENCH_COUNT) {
        benchmark(copyji, "copyji");
        run++;
    }

    return 0;
}
/*
 * Time one call of f and print the elapsed time.
 *
 * f:    function to measure
 * name: label printed in front of the result
 *
 * The matrices are re-initialized with init_mat() before the timer starts,
 * so initialization is not part of the measurement.  If the clock cannot be
 * read, the error is reported on stderr and the program exits with
 * EXIT_FAILURE.
 */
static void
benchmark(void (*f)(void), const char *name)
{
    struct timespec start, end, delta;

    init_mat();

    /*
     * clock_gettime() must not live inside assert(): with NDEBUG defined the
     * whole expression is removed and start/end would be read uninitialized.
     */
    if (clock_gettime(CLOCK_MONOTONIC, &start) == -1) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }
    f();
    if (clock_gettime(CLOCK_MONOTONIC, &end) == -1) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }

    timespecsub(&end, &start, &delta);

    /* time_t has an implementation-defined type; cast it to match %lld. */
    printf("%s: NNN=%d: elapsed=%lld.%.9ld secs\n", name, NNN,
           (long long)delta.tv_sec, delta.tv_nsec);
}
#pragma GCC push_options
#pragma GCC optimize("O0")
/*
 * Row-by-row pass over src: the inner loop advances the column index, so
 * consecutive accesses are adjacent in memory.
 *
 * Despite its name the function does not copy anything; it increments every
 * element of src in place.
 */
static void
copyij(void)
{
    for (size_t i = 0; i < NNN; i++) {
        for (size_t j = 0; j < NNN; j++) {
            src[i][j] += 1;
        }
    }
}
/*
 * Column-by-column pass over dst: the inner loop advances the row index
 * (dst[j][i]), so consecutive accesses are a full row of NNN ints apart.
 *
 * Despite its name the function does not copy anything; it increments every
 * element of dst in place.
 */
static void
copyji(void)
{
    for (size_t i = 0; i < NNN; i++) {
        for (size_t j = 0; j < NNN; j++) {
            dst[j][i] += 1;
        }
    }
}
#pragma GCC pop_options
/*
 * Reset both matrices: every element of src and dst is set to 1.
 */
static void
init_mat(void)
{
    for (size_t row = 0; row < NNN; row++) {
        for (size_t col = 0; col < NNN; col++) {
            src[row][col] = dst[row][col] = 1;
        }
    }
}
Output (i7-7500U @ 2.7 GHz, inside WSL, -O3
)
ij:
copyij: NNN=2048: elapsed=0.020582100 secs
copyij: NNN=2048: elapsed=0.016620800 secs
copyij: NNN=2048: elapsed=0.016156000 secs
copyij: NNN=2048: elapsed=0.017765700 secs
copyij: NNN=2048: elapsed=0.016158500 secs
copyij: NNN=2048: elapsed=0.016127900 secs
copyij: NNN=2048: elapsed=0.016153200 secs
copyij: NNN=2048: elapsed=0.016337300 secs
copyij: NNN=2048: elapsed=0.016625900 secs
copyij: NNN=2048: elapsed=0.016512300 secs
ji:
copyji: NNN=2048: elapsed=0.055380300 secs
copyji: NNN=2048: elapsed=0.056751900 secs
copyji: NNN=2048: elapsed=0.055770200 secs
copyji: NNN=2048: elapsed=0.056378700 secs
copyji: NNN=2048: elapsed=0.057477700 secs
copyji: NNN=2048: elapsed=0.058508900 secs
copyji: NNN=2048: elapsed=0.058080200 secs
copyji: NNN=2048: elapsed=0.057968100 secs
copyji: NNN=2048: elapsed=0.058937900 secs
copyji: NNN=2048: elapsed=0.056836300 secs
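I suspect ji is slower because its inner loop walks down a column: consecutive accesses are a whole row (NNN ints) apart, so it operates on noncontiguous memory and makes poor use of the cache, whereas ij touches adjacent elements.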