As suggested by @PerJohansson, I wrote & tested following code, it works well on linux, using mmap with MAP_SHARED|MAP_FIXED flag, we can map the same physical page allocated by POSIX shm object multiple times and continuously into very large virtual memory.
#include "stdio.h"
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h> /* For mode constants */
#include <fcntl.h> /* For O_* constants */
void * alloc_1page_mem(int size) {
int fd;
char * ptr_base;
char * rptr;
/* Create shared memory object and set its size */
fd = shm_open("/myregion", O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
if (fd == -1) {
perror("error in shm_open");
return NULL;
}
if (ftruncate(fd, 4096) == -1) {
perror("error in ftruncate");
return NULL;
}
// following trick reserves big enough holes in VM space
ptr_base = rptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
munmap(rptr, size);
for(int i=0; i<size; i+=4096) {
rptr = mmap(rptr, 4096, PROT_READ | PROT_WRITE, MAP_SHARED|MAP_FIXED, fd, 0);
if (rptr == MAP_FAILED) {
perror("error in mmap");
return NULL;
}
rptr += 4096;
}
close(fd);
shm_unlink("/myregion");
return ptr_base;
}
void check(int * p, int total_cnt){
for (int i=0;i<4096/sizeof(int);i++) {
p[i] = i;
}
int fail_cnt = 0;
for (int k=0; k<total_cnt; k+= 4096/sizeof(int)) {
for (int i=0;i<4096/sizeof(int);i++) {
if (p[k+i] != i)
fail_cnt ++;
}
}
printf("fail_cnt=%d\n", fail_cnt);
}
int main(int argc, const char * argv[]) {
const char * cmd = argv[1];
int sum;
int total_cnt = 32*1024*1024;
int * p = NULL;
if (*cmd++ == '1')
p = alloc_1page_mem(total_cnt*sizeof(int));
else
p = malloc(total_cnt*sizeof(int));
sum = 0;
while(*cmd) {
switch(*cmd++) {
case 'c':
check(p, total_cnt);
break;
case 'w':
// save only 4bytes per cache line
for (int k=0;k<total_cnt;k+=64/sizeof(int)){
p[k] = sum;
}
break;
case 'r':
// read only 4bytes per cache line
for (int k=0;k<total_cnt;k+=64/sizeof(int)) {
sum += p[k];
}
break;
case 'p':
// prevent sum from being optimized
printf("sum=%d\n", sum);
}
}
return 0;
}
You can observe very low cache miss rate on memory allocated in such method:
$ sudo perf stat -e mem_load_retired.l3_miss -- ./a.out 0wrrrrr
# this produces L3 miss linearly increase with number of 'r' charaters
$ sudo perf stat -e mem_load_retired.l3_miss -- ./a.out 1wrrrrr
# this produces almost constant L3 miss.