Here's some code that is a more or less working version of your code, with logging added, amongst other things. The logging makes it much easier to see that it is behaving plausibly.
#include "stderr.h"
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#define N 5
/*
 * Fork one child worker (test harness version).
 *
 * Child: logs how long it is about to "work", sleeps for that long (under
 * three seconds), then exits.  Roughly 9% of the time it exits with a random
 * status in 0..254 instead of 0.  The child never returns from this function.
 *
 * Parent: returns the child's PID on success.  Returns -1 if fork() failed
 * (errno as set by fork()).  To simulate failures, roughly 9% of successful
 * forks are reported as failures: the child is sent a random signal in the
 * range 1..8, errno is set to EAGAIN, and -1 is returned.  Such a child has
 * still been created, so the caller will later collect a PID it was never
 * told about.
 */
static int newProcess(void)
{
    int pid = fork();
    if (pid == 0)
    {
        // work - this process goes to sleep on the job!
        struct timespec nap = { .tv_sec = rand() % 3, .tv_nsec = rand() % 1000000000 };
        // Announce the work before doing it; %.9ld zero-pads the nanoseconds
        // so that 82518051 ns prints as .082518051 rather than ". 82518051".
        err_remark("About to do %ld.%.9ld seconds work\n", (long)nap.tv_sec, (long)nap.tv_nsec);
        nanosleep(&nap, 0);
        int rc = 0;
        if (rand() % 100 > 90)
            rc = rand() % 255;
        err_remark("Work completed - exit status %d\n", rc);
        exit(rc);
    }
    if (pid > 0 && rand() % 100 > 90)
    {
        // Fake a fork failure: kill the new child and report EAGAIN.
        kill(pid, rand() % 8 + 1);
        errno = EAGAIN;
        pid = -1;
    }
    return pid;
}
/*
 * Probe whether process `pid` can still be signalled, and log the result.
 *
 * Sends signal 0 with kill(), which performs the usual existence and
 * permission checks without delivering a signal.  Logs "OK" when kill()
 * succeeds and "Dead" otherwise.
 *
 * Returns the kill() result: 0 if the probe succeeded, non-zero if not.
 *
 * NOTE(review): a terminated child that has not yet been waited for is
 * presumably still reported as "OK" by this probe - confirm on the target
 * platform.
 */
static inline int check_child(int pid)
{
    const int probe = kill(pid, 0);     /* Signal 0: check only, nothing sent */
    const char *state = "Dead";

    if (probe == 0)
        state = "OK";
    err_remark("PID %d - %s\n", pid, state);
    return probe;
}
/*
 * Prune PIDs that no longer respond to check_child() from the list.
 *
 * npids - in/out: number of valid entries in pids[]; reduced by one for
 *         each entry removed.
 * pids  - list of child PIDs; order is not preserved (a removed entry is
 *         overwritten by the current last entry).
 */
static void process_check(int *npids, int pids[])
{
    err_remark("Checking PID list\n");
    for (int i = 0; i < *npids; i++)
    {
        // Re-test slot i after each removal because it now holds a different
        // PID.  The `i < *npids` bound stops the loop once the list has
        // shrunk to (or below) i; without it, removing the last entry would
        // re-test the same dead PID, discarding live entries or looping
        // forever when the list becomes empty.
        while (i < *npids && check_child(pids[i]) != 0)
        {
            // Child is presumably dead!
            pids[i] = pids[--*npids];
        }
    }
}
/*
 * Monitor: keep up to N child workers running.
 *
 * While fewer than N children are tracked, start another with newProcess().
 * If that fails, log the error, re-validate the tracked PID list and, for
 * EAGAIN, pause for 1-10 milliseconds before trying again.  Once N children
 * are tracked, block in wait() for any child to finish and drop its PID
 * from the list; PIDs that were never tracked are logged and ignored.
 *
 * The loop never terminates of its own accord.
 */
int main(int argc, char **argv)
{
    err_setarg0(argv[0]);
    if (argc != 1)
        err_usage(" # No arguments allowed");   // presumably does not return - confirm in stderr.h
    int child_pid[N] = { 0 };
    srand(time(0));
    err_setlogopts(ERR_PID | ERR_MICRO);
    int processCount = 0;
    while (1)
    {
        if (processCount < N)
        {
            int pid = newProcess();
            if (pid > 0)
            {
                child_pid[processCount++] = pid;
                err_remark("PID %d started\n", pid);
            }
            else
            {
                assert(pid == -1);
                // Save errno before logging in case the logging changes it.
                int errnum = errno;
                // The trailing ": " separates the message from the error
                // text appended by err_sysrem() (previously the log read
                // "Failed to forkerror (35) ...").
                err_sysrem("Failed to fork: ");
                process_check(&processCount, child_pid);
                if (errnum == EAGAIN)
                {
                    struct timespec nap = { .tv_sec = 0, .tv_nsec = (rand() % 10 + 1) * 1000000 };
                    nanosleep(&nap, 0); // sleep 1-10 milliseconds (could be too big).
                }
            }
        }
        else
        {
            int status;
            int corpse = wait(&status);
            if (corpse > 0)
            {
                int known_pid = 0;
                for (int i = 0; i < processCount; i++)
                {
                    if (child_pid[i] == corpse)
                    {
                        err_remark("PID %d exit status 0x%.4X\n", corpse, status);
                        known_pid = 1;
                        // Unordered removal: move the last entry into the gap.
                        child_pid[i] = child_pid[--processCount];
                        break;
                    }
                }
                if (!known_pid)
                    err_remark("Unknown PID %d exit status 0x%.4X - ignored\n", corpse, status);
            }
            else if (errno == ECHILD)
            {
                // No children exist, so every tracked PID is stale.  Empty
                // the list so new children get started instead of spinning
                // on a wait() that can never succeed.
                err_sysrem("wait() failed: ");
                processCount = 0;
            }
            // Any other failure (e.g. EINTR): simply retry on the next pass.
        }
    }
    return 0;   // Not reached
}
The header stderr.h
and its companion source code stderr.c
can be found at GitHub in my SOQ repository in the libsoq
folder. It provides a convenient and configurable logging service.
Note that the test code fakes some failures, and kills some children, etc. You'd remove that from production code. You might well retain most of the logging, especially if the children normally work for many seconds at a time rather than just a few as in this example.
Some sample output:
$ ./mon61
mon61: 2017-12-01 09:48:03.636756 - pid=74353: PID 74354 started
mon61: 2017-12-01 09:48:03.637568 - pid=74353: PID 74355 started
mon61: 2017-12-01 09:48:03.637724 - pid=74353: PID 74356 started
mon61: 2017-12-01 09:48:03.637885 - pid=74353: PID 74357 started
mon61: 2017-12-01 09:48:03.638048 - pid=74353: PID 74358 started
mon61: 2017-12-01 09:48:03.747398 - pid=74356: About to do 0.108225168 seconds work
mon61: 2017-12-01 09:48:03.748152 - pid=74356: Work completed - exit status 0
mon61: 2017-12-01 09:48:03.748791 - pid=74353: PID 74356 exit status 0x0000
mon61: 2017-12-01 09:48:03.749046 - pid=74353: PID 74359 started
mon61: 2017-12-01 09:48:04.032219 - pid=74359: About to do 0.281932019 seconds work
mon61: 2017-12-01 09:48:04.032971 - pid=74359: Work completed - exit status 0
mon61: 2017-12-01 09:48:04.033747 - pid=74353: PID 74359 exit status 0x0000
mon61: 2017-12-01 09:48:04.034007 - pid=74353: PID 74361 started
mon61: 2017-12-01 09:48:04.602396 - pid=74355: About to do 0.964067315 seconds work
mon61: 2017-12-01 09:48:04.602951 - pid=74355: Work completed - exit status 0
mon61: 2017-12-01 09:48:04.603596 - pid=74353: PID 74355 exit status 0x0000
mon61: 2017-12-01 09:48:04.603855 - pid=74353: PID 74362 started
mon61: 2017-12-01 09:48:05.419466 - pid=74358: About to do 1.780199743 seconds work
mon61: 2017-12-01 09:48:05.420017 - pid=74358: Work completed - exit status 0
mon61: 2017-12-01 09:48:05.420669 - pid=74353: PID 74358 exit status 0x0000
mon61: 2017-12-01 09:48:05.420923 - pid=74353: PID 74363 started
mon61: 2017-12-01 09:48:05.453929 - pid=74357: About to do 1.814728145 seconds work
mon61: 2017-12-01 09:48:05.454320 - pid=74357: Work completed - exit status 0
mon61: 2017-12-01 09:48:05.454753 - pid=74353: PID 74357 exit status 0x0000
mon61: 2017-12-01 09:48:05.454939 - pid=74353: PID 74364 started
mon61: 2017-12-01 09:48:05.512822 - pid=74354: About to do 1.875699204 seconds work
mon61: 2017-12-01 09:48:05.514094 - pid=74354: Work completed - exit status 0
mon61: 2017-12-01 09:48:05.514349 - pid=74353: PID 74354 exit status 0x0000
mon61: 2017-12-01 09:48:05.514658 - pid=74353: PID 74365 started
mon61: 2017-12-01 09:48:06.004823 - pid=74362: About to do 1.399425773 seconds work
mon61: 2017-12-01 09:48:06.005581 - pid=74362: Work completed - exit status 0
mon61: 2017-12-01 09:48:06.006237 - pid=74353: PID 74362 exit status 0x0000
mon61: 2017-12-01 09:48:06.006523 - pid=74353: Failed to forkerror (35) Resource temporarily unavailable
mon61: 2017-12-01 09:48:06.006562 - pid=74353: Checking PID list
mon61: 2017-12-01 09:48:06.006570 - pid=74353: PID 74364 - OK
mon61: 2017-12-01 09:48:06.006576 - pid=74353: PID 74361 - OK
mon61: 2017-12-01 09:48:06.006582 - pid=74353: PID 74365 - OK
mon61: 2017-12-01 09:48:06.006588 - pid=74353: PID 74363 - OK
mon61: 2017-12-01 09:48:06.013228 - pid=74353: PID 74368 started
mon61: 2017-12-01 09:48:06.013267 - pid=74353: Unknown PID 74366 exit status 0x0006 - ignored
mon61: 2017-12-01 09:48:06.117089 - pid=74361: About to do 2. 82518051 seconds work
mon61: 2017-12-01 09:48:06.117618 - pid=74361: Work completed - exit status 0
mon61: 2017-12-01 09:48:06.118206 - pid=74353: PID 74361 exit status 0x0000
mon61: 2017-12-01 09:48:06.118486 - pid=74353: PID 74369 started
mon61: 2017-12-01 09:48:06.537455 - pid=74363: About to do 1.115086289 seconds work
mon61: 2017-12-01 09:48:06.537967 - pid=74363: Work completed - exit status 0
mon61: 2017-12-01 09:48:06.538610 - pid=74353: PID 74363 exit status 0x0000
mon61: 2017-12-01 09:48:06.538880 - pid=74353: PID 74371 started
mon61: 2017-12-01 09:48:06.682182 - pid=74371: About to do 0.141922802 seconds work
mon61: 2017-12-01 09:48:06.682945 - pid=74371: Work completed - exit status 0
mon61: 2017-12-01 09:48:06.683733 - pid=74353: PID 74371 exit status 0x0000
mon61: 2017-12-01 09:48:06.684007 - pid=74353: PID 74372 started
mon61: 2017-12-01 09:48:06.975561 - pid=74364: About to do 1.519976923 seconds work
mon61: 2017-12-01 09:48:06.976341 - pid=74364: Work completed - exit status 188
mon61: 2017-12-01 09:48:06.976942 - pid=74353: PID 74364 exit status 0xBC00
mon61: 2017-12-01 09:48:06.977225 - pid=74353: PID 74373 started
mon61: 2017-12-01 09:48:07.436814 - pid=74368: About to do 1.422967208 seconds work
mon61: 2017-12-01 09:48:07.437600 - pid=74368: Work completed - exit status 0
mon61: 2017-12-01 09:48:07.438230 - pid=74353: PID 74368 exit status 0x0000
Scrutiny of the log shows that there are some 'unknown PID' dying messages. That suggests there is some work to be done in the management of the array of PIDs (aka 'fix bugs'). I may get to look into that later.
Looking at the code, those are 'expected'. There is an approximately 9% chance that a child is created but then killed by a signal (and all those children have the status set to a value 0x0001 to 0x0008, indicating death by signal). For those processes, the return status from newProcess()
is -1
which prevents the PID from being entered into the list of known children, so when the child does die and the status information is collected, the PID is 'unknown'. In other words, this is 'expected' behaviour. Such processes could be noted better by negating the PID returned to the calling process, and writing a message indicating that this particular child PID was created but died from a signal (probably before the child had a chance to do anything such as report that it is running).
The err_remark()
call related to "About to do N.xxxxxxxxxx seconds work"
is both misplaced and misformatted in the version that produced the sample output (note the "2. 82518051" entry). It should come before, not after, the nanosleep. It should also use %.9ld
instead of %9ld
to format the fractional time. Both are easily fixed.
There are various improvements to be made, apart from having the children do real work instead of just sleeping on the job. The code could handle some signals (interrupt to check the children, hangup to reread a configuration file, terminate to kill the children and exit, for example). It could write to a log file instead of standard error. It could be daemonized instead of running in the foreground. It could have options to control the log file directory, and perhaps the log file name. It could detect if/when its log file is deleted and start a new one. Etc.
But this gives you something to play with.