To illustrate the copy on write feature, I have just developed the following code snippet:
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h> // for system()
#include <time.h>
#include <unistd.h> // for execl(), fork()
#include <wait.h> // for wait()
#define D(x) __FILE__":%d:%s: " x, __LINE__, __func__
#define PAGESIZE 4096U
#define NPAGES 100000U
/* auxiliary routines to do time accounting */
t_start(struct timespec *t)
{
clock_gettime(CLOCK_REALTIME, t);
} /* t_start */
t_print(struct timespec *t)
{
struct timespec now;
clock_gettime(CLOCK_REALTIME, &now); /* stop */
now.tv_nsec -= t->tv_nsec;
now.tv_sec -= t->tv_sec;
if (now.tv_nsec < 0) {
now.tv_nsec += 1000000000L;
now.tv_sec--;
} /* if */
printf(D("elapsed: %d.%09d\n"),
now.tv_sec, now.tv_nsec);
} /* t_print */
void testv(
struct timespec *t,
char *b,
size_t bs,
const char *fmt,
va_list p)
{
int i;
static char c = 0;
vprintf(fmt, p);
t_start(t);
for (i = 0; i < bs; i += PAGESIZE)
b[i] = c;
c++;
t_print(t);
} /* testv */
void test(struct timespec *t,
char *b,
size_t bs,
const char *fmt,
...)
{
va_list p;
va_start(p, fmt);
testv(t, b, bs, fmt, p);
va_end(p);
} /* test */
int main(int argc, char *argv[])
{
static char buffer[NPAGES*PAGESIZE];
struct timespec ts;
int i, res;
test(&ts, buffer, sizeof buffer,
D("The first test (expect high time--page allocating)\n"));
test(&ts, buffer, sizeof buffer,
D("The second test (expect low time)\n"));
switch(res = fork()) {
case -1:
fprintf(stderr,
D("Cannot fork: %s(errno=%d)\n"),
strerror(errno), errno);
exit(EXIT_FAILURE);
case 0: /* child */
test(&ts, buffer, sizeof buffer,
D("child[%d]: third test (expect high time--copy on write)\n"),
getpid());
test(&ts, buffer, sizeof buffer,
D("child[%d]: fourth test (expect low time)\n"),
getpid());
exit(EXIT_SUCCESS);
default: /* parent */
printf(D("parent[%d]: waiting for child[%d] to finish\n"),
getpid(), res);
wait(NULL); /* expect so the calls don't get intermixed */
test(&ts, buffer, sizeof buffer,
D("parent[%d]: third test (expect medium time--swapping)\n"),
getpid());
test(&ts, buffer, sizeof buffer,
D("parent[%d]: third test (expect low time)\n"),
getpid());
exit(EXIT_SUCCESS);
} /* if */
/*NOTREACHED*/
} /* main */
I'll try to explain the code now:
- first some functions are prepared to do some timestamping and allow to calculate times from the code execution.
t_start()
just starts a timer and t_print()
takes a second timestamp and prints de difference time between last t_start()
and now.
test()
does some test. It measures the time to write the whole buffer (in jumps of one page to make it run as fast as possible, to show the page faults that happen on it)
main()
does some tests before fork()
ing, as we want to assure all buffer pages exist and are allocated before forking. We shall see also that the initial time to get all pages in core memory is on the same order as child needs to do the copy on write (well, 50% more in the child)
main()
fork()
s a child and waits for it to execute. I could have done the test in the reverse, but the parent can wait()
for the child, but not in the opposite.
- then, both, parent and child do some tests, showing that the child has to go 50% further to wait for the copy on write thing to happen.
The program results are here:
$ time pru
pru.c:70:main: The first test (expect high time--page allocating)
pru.c:30:t_print: elapsed: 0.126230771
pru.c:72:main: The second test (expect low time)
pru.c:30:t_print: elapsed: 0.002087815
pru.c:82:main: child[4392]: third test (expect high time--copy on write)
pru.c:30:t_print: elapsed: 0.152463844
pru.c:85:main: child[4392]: fourth test (expect low time)
pru.c:30:t_print: elapsed: 0.001906929
pru.c:70:main: The first test (expect high time--page allocating)
pru.c:30:t_print: elapsed: 0.126230771
pru.c:72:main: The second test (expect low time)
pru.c:30:t_print: elapsed: 0.002087815
pru.c:89:main: parent[4390]: waiting for child[4392] to finish
pru.c:93:main: parent[4390]: third test (expect medium time--swapping)
pru.c:30:t_print: elapsed: 0.046004906
pru.c:96:main: parent[4390]: third test (expect low time)
pru.c:30:t_print: elapsed: 0.001905371
0m0.35s real 0m0.05s user 0m0.30s system
As you can see, almost 90% the time is system time, time the system has had to page in.
All the tests have been made in a i5/8Gb mem/Debian 64bit, so, as the memory requirements for parent and child was the whole memory (1000000 pages of 4K makes 4Gb per process) the timing for swap in is justified.
on the other side, I have modified the program to measure the time in the fork()
showing:
$ pru
pru.c:70:main: pid=4786: The first test (expect high time--page allocating)
pru.c:30:t_print: pid=4786: elapsed: 0.128644899
pru.c:73:main: pid=4786: The second test (expect low time)
pru.c:30:t_print: pid=4786: elapsed: 0.001815846
pru.c:85:main: child[4788]: fork time:
pru.c:30:t_print: pid=4788: elapsed: 0.003451545
pru.c:88:main: child[4788]: third test (expect high time--copy on write)
pru.c:30:t_print: pid=4788: elapsed: 0.151911675
pru.c:91:main: child[4788]: fourth test (expect low time)
pru.c:30:t_print: pid=4788: elapsed: 0.001831539
pru.c:70:main: pid=4786: The first test (expect high time--page allocating)
pru.c:30:t_print: pid=4786: elapsed: 0.128644899
pru.c:73:main: pid=4786: The second test (expect low time)
pru.c:30:t_print: pid=4786: elapsed: 0.001815846
pru.c:95:main: parent[4786]: fork time:
pru.c:30:t_print: pid=4786: elapsed: 0.003405556
pru.c:97:main: parent[4786]: waiting for child[4788] to finish
pru.c:101:main: parent[4786]: third test (expect medium time--swapping)
pru.c:30:t_print: pid=4786: elapsed: 0.046373034
pru.c:104:main: parent[4786]: third test (expect low time)
pru.c:30:t_print: pid=4786: elapsed: 0.001782398
As you can see, fork times (in the parent or in the child) are by far too low compared with the time to do the copy (on writing the pages).
Following is the final code with fork()
timestamping:
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h> // for system()
#include <time.h>
#include <unistd.h> // for execl(), fork()
#include <wait.h> // for wait()
#define D(x) __FILE__":%d:%s: " x, __LINE__, __func__
#define PAGESIZE 4096U
#define NPAGES 100000U
/* auxiliary routines to do time accounting */
t_start(struct timespec *t)
{
clock_gettime(CLOCK_REALTIME, t);
} /* t_start */
t_print(struct timespec *t)
{
struct timespec now;
clock_gettime(CLOCK_REALTIME, &now); /* stop */
now.tv_nsec -= t->tv_nsec;
now.tv_sec -= t->tv_sec;
if (now.tv_nsec < 0) {
now.tv_nsec += 1000000000L;
now.tv_sec--;
} /* if */
printf(D("pid=%d: elapsed: %d.%09d\n"),
getpid(), now.tv_sec, now.tv_nsec);
} /* t_print */
void testv(
struct timespec *t,
char *b,
size_t bs,
const char *fmt,
va_list p)
{
int i;
static char c = 0;
vprintf(fmt, p);
t_start(t);
for (i = 0; i < bs; i += PAGESIZE)
b[i] = c;
c++;
t_print(t);
} /* testv */
void test(struct timespec *t,
char *b,
size_t bs,
const char *fmt,
...)
{
va_list p;
va_start(p, fmt);
testv(t, b, bs, fmt, p);
va_end(p);
} /* test */
int main(int argc, char *argv[])
{
static char buffer[NPAGES*PAGESIZE];
struct timespec ts;
int i, res, pid = getpid();
test(&ts, buffer, sizeof buffer,
D("pid=%d: The first test (expect high time--page allocating)\n"),
pid);
test(&ts, buffer, sizeof buffer,
D("pid=%d: The second test (expect low time)\n"),
pid);
t_start(&ts);
switch(res = fork()) {
case -1:
fprintf(stderr,
D("Cannot fork: %s(errno=%d)\n"),
strerror(errno), errno);
exit(EXIT_FAILURE);
case 0: /* child */
pid = getpid();
printf(D("child[%d]: fork time:\n"), pid);
t_print(&ts);
test(&ts, buffer, sizeof buffer,
D("child[%d]: third test (expect high time--copy on write)\n"),
getpid());
test(&ts, buffer, sizeof buffer,
D("child[%d]: fourth test (expect low time)\n"),
getpid());
exit(EXIT_SUCCESS);
default: /* parent */
printf(D("parent[%d]: fork time:\n"), pid);
t_print(&ts);
printf(D("parent[%d]: waiting for child[%d] to finish\n"),
getpid(), res);
wait(NULL); /* expect so the calls don't get intermixed */
test(&ts, buffer, sizeof buffer,
D("parent[%d]: third test (expect medium time--swapping)\n"),
getpid());
test(&ts, buffer, sizeof buffer,
D("parent[%d]: third test (expect low time)\n"),
getpid());
exit(EXIT_SUCCESS);
} /* switch */
/*NOTREACHED*/
} /* main */
NOTE
In the output appears the first part of the output twice (one in the parent and one in the child process) As printf()
works by buffering (and I have filtered output to sed -e 's/^/ /'
to include the four spaces required to paste the output here as a code snippet, and the contents of the buffered output have not been flushed out at fork()
's time, it appears in both outputs, the parent's and the child's. I have included it to show that the data segments of both, parent and child, are exactly equal, including <stdio.h>
internal structures like non-flushed output. A fflush(stdout);
or fflush(NULL);
before fork()
solves this problem, making it only appear once.