The line of code
next += val;
makes the program roughly 6-7x slower (0.19 s vs. 1.24 s in the output below). I have checked the generated assembly, and both functions compute the same result.
Why does this single line of code cause such a large slowdown?
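To make the comparison concrete, the only difference between the two functions in the full listing below is the last statement of the loop body, excerpted here (my_rand64, ptr, nr_words, next, val, and sum are exactly as in the program):

    for (i = 0; i < NUM; i++) {
        my_rand64(&next);
        next %= nr_words;
        val = ptr[next];
        sum += val ^ next;
        next += val;        /* present only in rand_read_2 */
    }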
Here is the result:
➜ ~ clang-13 1.c -O3
➜ ~ ./a.out
rand_read_1
sum = 2624b18779c40, time = 0.19s
rand_read_2
sum = 2624b18779c40, time = 1.24s
CPU: Intel(R) Xeon(R) Silver 4210 CPU @ 2.20GHz
Here is the code:
#include <stdio.h>
#include <time.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <assert.h>
#include <stdlib.h>
#define CCR_MULTIPLY_64 6364136223846793005
#define CCR_ADD_64 1
/* 64-bit linear congruential generator: advances *r and returns the new state */
static inline uint64_t my_rand64(uint64_t *r)
{
    *r = *r * CCR_MULTIPLY_64 + CCR_ADD_64;
    return *r;
}
#define NUM 10000000UL
/* sum NUM words read at pseudo-random indices; each index depends only on the PRNG state */
uint64_t rand_read_1(uint64_t *ptr, uint64_t nr_words)
{
    uint64_t i, next, val = 0;
    uint64_t sum;

    next = 0;
    sum = 0;
    for (i = 0; i < NUM; i++) {
        my_rand64(&next);
        next %= nr_words;
        val = ptr[next];
        sum += val ^ next;
        // printf("next1:%ld\n", next);
    }
    return sum;
}
/* same as rand_read_1, except the next index also depends on the value just loaded */
uint64_t rand_read_2(uint64_t *ptr, uint64_t nr_words)
{
    uint64_t i, next, val;
    uint64_t sum;

    next = 0;
    sum = 0;
    for (i = 0; i < NUM; i++) {
        my_rand64(&next);
        next %= nr_words;
        val = ptr[next];
        sum += val ^ next;
        next += val;    /* the only difference from rand_read_1 */
    }
    return sum;
}
#define SIZE (1024*1024*1024)
static uint64_t get_ns(void)
{
    struct timespec val;
    uint64_t v;
    int ret;

    ret = clock_gettime(CLOCK_REALTIME, &val);
    if (ret != 0) {
        perror("clock_gettime");
        exit(1);
    }
    v = (uint64_t) val.tv_sec * 1000000000LL;
    v += (uint64_t) val.tv_nsec;
    return v;
}
int main(int argc, char *argv[])
{
    uint64_t *ptr;
    uint64_t sum;
    uint64_t t0, t1, td, t2;

    ptr = (uint64_t *)malloc(SIZE);
    assert(ptr);
    memset(ptr, 0, SIZE);

    t0 = get_ns();
    printf("rand_read_1\n");
    sum = rand_read_1(ptr, SIZE/8);
    t1 = get_ns();
    td = t1 - t0;
    printf("sum = %lx, time = %.2fs\n", sum, td/1E9);

    printf("rand_read_2\n");
    sum = rand_read_2(ptr, SIZE/8);
    t2 = get_ns();
    td = t2 - t1;
    printf("sum = %lx, time = %.2fs\n", sum, td/1E9);

    return 0;
}