Here's my full solution which adds time control and gives the number of failed jobs. Also takes care to kill children of failed jobs if needed, and deals with zombie or uninterruptible processes:
function Logger {
echo "$1"
}
# Portable child (and grandchild) kill function tester under Linux, BSD and MacOS X
function KillChilds {
local pid="${1}" # Parent pid to kill childs
local self="${2:-false}" # Should parent be killed too ?
if children="$(pgrep -P "$pid")"; then
KillChilds "$child" true
done
fi
# Try to kill nicely, if not, wait 15 seconds to let Trap actions happen before killing
if ( [ "$self" == true ] && kill -0 $pid > /dev/null 2>&1); then
kill -s TERM "$pid"
if [ $? != 0 ]; then
sleep 15
Logger "Sending SIGTERM to process [$pid] failed."
kill -9 "$pid"
if [ $? != 0 ]; then
Logger "Sending SIGKILL to process [$pid] failed."
return 1
fi
else
return 0
fi
else
return 0
fi
}
function WaitForTaskCompletion {
local pids="${1}" # pids to wait for, separated by semi-colon
local soft_max_time="${2}" # If program with pid $pid takes longer than $soft_max_time seconds, will log a warning, unless $soft_max_time equals 0.
local hard_max_time="${3}" # If program with pid $pid takes longer than $hard_max_time seconds, will stop execution, unless $hard_max_time equals 0.
local caller_name="${4}" # Who called this function
local counting="${5:-true}" # Count time since function has been launched if true, since script has been launched if false
local keep_logging="${6:-0}" # Log a standby message every X seconds. Set to zero to disable logging
local soft_alert=false # Does a soft alert need to be triggered, if yes, send an alert once
local log_ttime=0 # local time instance for comparaison
local seconds_begin=$SECONDS # Seconds since the beginning of the script
local exec_time=0 # Seconds since the beginning of this function
local retval=0 # return value of monitored pid process
local errorcount=0 # Number of pids that finished with errors
local pid # Current pid working on
local pidCount # number of given pids
local pidState # State of the process
local pidsArray # Array of currently running pids
local newPidsArray # New array of currently running pids
IFS=';' read -a pidsArray <<< "$pids"
pidCount=${#pidsArray[@]}
WAIT_FOR_TASK_COMPLETION=""
while [ ${#pidsArray[@]} -gt 0 ]; do
newPidsArray=()
Spinner
if [ $counting == true ]; then
exec_time=$(($SECONDS - $seconds_begin))
else
exec_time=$SECONDS
fi
if [ $keep_logging -ne 0 ]; then
if [ $((($exec_time + 1) % $keep_logging)) -eq 0 ]; then
if [ $log_ttime -ne $exec_time ]; then # Fix when sleep time lower than 1s
log_ttime=$exec_time
fi
fi
fi
if [ $exec_time -gt $soft_max_time ]; then
if [ $soft_alert == true ] && [ $soft_max_time -ne 0 ]; then
Logger "Max soft execution time exceeded for task [$caller_name] with pids [$(joinString , ${pidsArray[@]})]."
soft_alert=true
SendAlert true
fi
if [ $exec_time -gt $hard_max_time ] && [ $hard_max_time -ne 0 ]; then
Logger "Max hard execution time exceeded for task [$caller_name] with pids [$(joinString , ${pidsArray[@]})]. Stopping task execution."
for pid in "${pidsArray[@]}"; do
KillChilds $pid true
if [ $? == 0 ]; then
Logger "Task with pid [$pid] stopped successfully." "NOTICE"
else
Logger "Could not stop task with pid [$pid]." "ERROR"
fi
done
SendAlert true
errrorcount=$((errorcount+1))
fi
fi
for pid in "${pidsArray[@]}"; do
if [ $(IsNumeric $pid) -eq 1 ]; then
if kill -0 $pid > /dev/null 2>&1; then
# Handle uninterruptible sleep state or zombies by ommiting them from running process array (How to kill that is already dead ? :)
#TODO(high): have this tested on *BSD, Mac & Win
pidState=$(ps -p$pid -o state= 2 > /dev/null)
if [ "$pidState" != "D" ] && [ "$pidState" != "Z" ]; then
newPidsArray+=($pid)
fi
else
# pid is dead, get it's exit code from wait command
wait $pid
retval=$?
if [ $retval -ne 0 ]; then
errorcount=$((errorcount+1))
Logger "${FUNCNAME[0]} called by [$caller_name] finished monitoring [$pid] with exitcode [$retval]. "DEBUG"
if [ "$WAIT_FOR_TASK_COMPLETION" == "" ]; then
WAIT_FOR_TASK_COMPLETION="$pid:$retval"
else
WAIT_FOR_TASK_COMPLETION=";$pid:$retval"
fi
fi
fi
fi
done
pidsArray=("${newPidsArray[@]}")
# Trivial wait time for bash to not eat up all CPU
sleep .05
done
# Return exit code if only one process was monitored, else return number of errors
if [ $pidCount -eq 1 ] && [ $errorcount -eq 0 ]; then
return $errorcount
else
return $errorcount
fi
}
Usage:
Let's take 3 sleep jobs, get their pids and send them to WaitforTaskCompletion:
sleep 10 &
pids="$!"
sleep 15 &
pids="$pids;$!"
sleep 20 &
pids="$pids;$!"
WaitForTaskCompletion $pids 1800 3600 ${FUNCNAME[0]} false 1800
The prior example would warn you if execution takes more than 1 hour, stop execution after 2 hours, and send a "alive" log message every half hour.