I have been running various tests to investigate this surprising result. I wrote the test program below, which combines various operations in the init phase and in the timed loop:
#include <stdio.h>
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/stat.h>

int alloc_size = 500 * 0x100000;  // 500 MB
char copy_buf[0x200000];          // 2 MB

double getTS() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec / 1000000.0;
}

// set a word on each page of a memory area
void memtouch(void *buf, size_t size) {
    uint64_t *p = buf;
    size /= sizeof(*p);
    for (size_t i = 0; i < size; i += 4096 / sizeof(*p))
        p[i] = 0;
}

// compute the sum of words on a memory area
uint64_t sum64(const void *buf, size_t size) {
    uint64_t sum = 0;
    const uint64_t *p = buf;
    size /= sizeof(*p);
    for (size_t i = 0; i < size; i++)
        sum += p[i];
    return sum;
}

// init: > 0 -> memset(init - 1), -1 -> memtouch, -2 -> sum64, 0 -> leave uninitialized
// run:  > 0 -> memset(run - 1),  < 0 -> sum64,   0 -> memcpy
void test1(int n, int init, int run) {
    int size = alloc_size;
    char msg[80];
    int pos = 0;
    double times[n + 1];
    uint64_t sum = 0;
    double initTS = getTS();
    char *p = malloc(size);
    pos = snprintf(msg + pos, sizeof msg - pos, "malloc");
    if (init > 0) {
        memset(p, init - 1, size);
        pos += snprintf(msg + pos, sizeof msg - pos, "+memset%.0d", init - 1);
    } else if (init == -1) {
        memtouch(p, size);
        pos += snprintf(msg + pos, sizeof msg - pos, "+memtouch");
    } else if (init == -2) {
        sum = sum64(p, size);
        pos += snprintf(msg + pos, sizeof msg - pos, "+sum64");
    } else {
        /* leave p uninitialized */
    }
    pos += snprintf(msg + pos, sizeof msg - pos, "+rep(%d, ", n);
    if (run > 0) {
        pos += snprintf(msg + pos, sizeof msg - pos, "memset%.0d)", run - 1);
    } else if (run < 0) {
        pos += snprintf(msg + pos, sizeof msg - pos, "sum64)");
    } else {
        pos += snprintf(msg + pos, sizeof msg - pos, "memcpy)");
    }
    double startTS = getTS();
    for (int i = 0; i < n; i++) {
        if (run > 0) {
            memset(p, run - 1, size);
        } else if (run < 0) {
            sum = sum64(p, size);
        } else {
            int offset = 0;
            while (offset < size) {
                memcpy(copy_buf, p + offset, 0x200000);
                offset += 0x200000;
            }
        }
        times[i] = getTS();
    }
    double firstTS = times[0] - startTS;
    printf("%f + %f", startTS - initTS, firstTS);
    if (n > 2) {
        double avgTS = (times[n - 2] - times[0]) / (n - 2);
        printf(" / %f", avgTS);
    }
    if (n > 1) {
        double lastTS = times[n - 1] - times[n - 2];
        printf(" / %f", lastTS);
    }
    printf(" secs %s", msg);
    if (sum != 0) {
        printf(" sum=%016llx", (unsigned long long)sum);
    }
    printf("\n");
    free(p);
}

int main(int argc, char *argv[]) {
    int n = 4;
    if (argc < 2) {
        test1(n, 0, 0);
        test1(n, 0, 1);
        test1(n, 0, -1);
        test1(n, 1, 0);
        test1(n, 1, 1);
        test1(n, 1, -1);
        test1(n, 2, 0);
        test1(n, 2, 1);
        test1(n, 2, -1);
        test1(n, -1, 0);
        test1(n, -1, 1);
        test1(n, -1, -1);
        test1(n, -2, 0);
        test1(n, -2, 1);
        test1(n, -2, -1);
    } else {
        test1(argc > 1 ? strtol(argv[1], NULL, 0) : n,
              argc > 2 ? strtol(argv[2], NULL, 0) : 0,
              argc > 3 ? strtol(argv[3], NULL, 0) : 0);
    }
    return 0;
}
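For reference, I built and ran it with a plain optimized build along these lines (assuming gcc; the file name is just a placeholder):

gcc -O2 -o memtest memtest.c
./memtest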
Running it on an old Linux box running Debian (kernel 3.16.0-11-amd64), I got these timings:
The columns are:
- init phase
- first iteration of the loop
- average of the second to penultimate iterations
- last iteration of the loop
- sequence of operations
0.000071 + 0.242601 / 0.113761 / 0.113711 secs malloc+rep(4, memcpy)
0.000032 + 0.349896 / 0.125809 / 0.125681 secs malloc+rep(4, memset)
0.000032 + 0.190461 / 0.049150 / 0.049210 secs malloc+rep(4, sum64)
0.350089 + 0.186691 / 0.186705 / 0.186548 secs malloc+memset+rep(4, memcpy)
0.350078 + 0.125603 / 0.125632 / 0.125531 secs malloc+memset+rep(4, memset)
0.349931 + 0.105991 / 0.105859 / 0.105788 secs malloc+memset+rep(4, sum64)
0.349860 + 0.186950 / 0.187031 / 0.186494 secs malloc+memset1+rep(4, memcpy)
0.349584 + 0.125537 / 0.125525 / 0.125535 secs malloc+memset1+rep(4, memset)
0.349620 + 0.106026 / 0.106114 / 0.105756 secs malloc+memset1+rep(4, sum64) sum=ebebebebebe80000
0.339846 + 0.186593 / 0.186686 / 0.186498 secs malloc+memtouch+rep(4, memcpy)
0.340156 + 0.125663 / 0.125716 / 0.125751 secs malloc+memtouch+rep(4, memset)
0.340141 + 0.105861 / 0.105806 / 0.105869 secs malloc+memtouch+rep(4, sum64)
0.190330 + 0.113774 / 0.113730 / 0.113754 secs malloc+sum64+rep(4, memcpy)
0.190149 + 0.400483 / 0.125638 / 0.125624 secs malloc+sum64+rep(4, memset)
0.190214 + 0.049136 / 0.049170 / 0.049149 secs malloc+sum64+rep(4, sum64)
The timings are consistent with the observations of the OP, and I found an explanation that accounts for them:
if the first access to a page is a read operation, the timings are substantially better than if the first access is a write.
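The effect can be isolated with a stripped-down sketch (my addition, not part of the measurements above): allocate two 500 MB blocks, touch one with a read pass first and the other with a write (memset) first, then time an identical read pass over each.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/time.h>

static double now(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec / 1000000.0;
}

// read every 64-bit word once and return the sum
static uint64_t readpass(const uint64_t *p, size_t words) {
    uint64_t s = 0;
    for (size_t i = 0; i < words; i++)
        s += p[i];
    return s;
}

int main(void) {
    size_t size = 500UL * 0x100000, words = size / sizeof(uint64_t);
    uint64_t sink = 0;

    uint64_t *a = malloc(size);   // first access below is a read
    sink += readpass(a, words);
    double t0 = now();
    sink += readpass(a, words);
    printf("read-first:  %.3f secs\n", now() - t0);

    uint64_t *b = malloc(size);
    memset(b, 0, size);           // first access is a write
    t0 = now();
    sink += readpass(b, words);
    printf("write-first: %.3f secs\n", now() - t0);

    return (int)(sink & 1);       // keep the sums live
}

On the box described above, I would expect the read-first pass to land near the 0.049 secs of malloc+rep(4, sum64) and the write-first pass near the 0.106 secs of malloc+memset+rep(4, sum64) in the table.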
Here are some observations consistent with this explanation:
- malloc() for a large 500 MB block just makes a system call to map the memory; it does not access this memory, and calloc would probably do exactly the same thing (see the fault-count sketch after this list).
- if you do not initialize this memory, it still gets mapped in RAM as zero-initialized pages for security reasons.
- when you initialize the memory with memset, the first access to the whole block is a write access, and the timings for the loop are slower.
- initializing the memory to all bytes 1 produces exactly the same timings.
- if I use memtouch instead, writing only the first word of each page to zero, I get the same timings in the loop.
- conversely, if instead of initializing the memory I compute a checksum, it comes out as zero (which is not guaranteed, but expected) and the timings in the loop are faster.
- if no access is performed to the block, then the timings in the loop depend on the operation performed: memcpy and sum64 are faster because the first access is a read access, while memset is slower because the first access is a write access.
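The lazy-mapping claim in the first bullet can be checked directly. A minimal sketch (my addition, not part of the test program) counts minor page faults with getrusage(): malloc() itself should fault almost nothing, a first read pass should fault once per page to install copy-on-write references to the shared zero page, and a subsequent write pass should fault once per page again to break the sharing.

#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

static long minor_faults(void) {
    struct rusage ru;
    getrusage(RUSAGE_SELF, &ru);
    return ru.ru_minflt;
}

int main(void) {
    size_t size = 500UL * 0x100000;   // same 500 MB block as above
    long base = minor_faults();

    volatile char *p = malloc(size);
    printf("after malloc: %ld faults\n", minor_faults() - base);

    base = minor_faults();
    char sink = 0;
    for (size_t i = 0; i < size; i += 4096)
        sink += p[i];                 // first access: read
    printf("after read:   %ld faults\n", minor_faults() - base);

    base = minor_faults();
    for (size_t i = 0; i < size; i += 4096)
        p[i] = 1;                     // write: breaks the CoW sharing
    printf("after write:  %ld faults\n", minor_faults() - base);

    (void)sink;
    return 0;
}

On a kernel that uses the shared zero page, I would expect roughly size / 4096 = 128000 faults for each of the two passes and close to zero for the malloc() itself (transparent huge pages, if enabled, reduce the counts).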
This seems specific to the Linux kernel: I do not observe the same difference on macOS, though that is on a different processor. The behavior might be specific to older Linux kernels and/or the CPU and bridge architecture.
FINAL UPDATE: As commented by Peter Cordes, a read from a never-written anonymous page gets every page copy-on-write mapped to the same physical page of zeros, so you can get TLB misses but L1d cache hits when reading it. (This applies to the .bss, and to memory from mmap(MAP_ANONYMOUS), which glibc calloc and malloc use for large allocations.) He wrote up some details, with perf results, for an experiment on Why is iterating though `std::vector` faster than iterating though `std::array`? This explains why memcpy, or just reading from memory that is only implicitly initialized to zero, is faster than reading from memory explicitly written to, and it is a good reason to use calloc() instead of malloc() + memset() for sparse data.
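To make that last point concrete, here is a sketch (my example, not from Peter Cordes' write-up) of the two ways of obtaining zeroed memory:

#include <stdlib.h>
#include <string.h>

#define N (500UL * 0x100000)   // 500 MB of zeroed bytes, as in the tests above

int main(void) {
    // calloc: untouched pages all reference the kernel's shared zero page,
    // so a workload that only reads the (mostly zero) data stays cheap.
    char *sparse = calloc(N, 1);

    // malloc + memset: the memset write-faults and dirties every page up
    // front, so later reads touch 500 MB of distinct physical pages.
    char *dense = malloc(N);
    memset(dense, 0, N);

    free(sparse);
    free(dense);
    return 0;
}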