Below is the shell script I am using as user-data.sh.
The issue is that the CloudFormation stack never receives a "SUCCESS" signal, so the resource signal times out and the stack ends up in ROLLBACK_COMPLETE status.
#!/bin/bash -x
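# Send all script output to /var/log/user-data.log, to syslog (tag "user-data") and to the console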
exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
echo "[START] UserData started at $(date +"%d-%m-%Y %H:%M")"
# Configure the yum proxy
sudo yum-config-manager --setopt=proxy=http://app-proxy:3128 --save
export no_proxy=".xyz"
# Retrieve service account & GitHub credentials from secret manager
USERNAME=$(aws secretsmanager get-secret-value --secret-id smtp-username --query 'SecretString' --output text --region ap-southeast-2)
API_TOKEN=$(aws secretsmanager get-secret-value --secret-id smtp-password --query 'SecretString' --output text --region ap-southeast-2)
GITHUB_USERNAME=$(aws secretsmanager get-secret-value --secret-id github-username --query 'SecretString' --output text --region ap-southeast-2)
GITHUB_TOKEN=$(aws secretsmanager get-secret-value --secret-id github-token --query 'SecretString' --output text --region ap-southeast-2)
# Ensure the SMTP credentials were retrieved
if [ -z "$USERNAME" ]; then
echo "Please provide smtp-username."
exit 1
fi
if [ -z "$API_TOKEN" ]; then
echo "Please provide smtp-password."
exit 1
fi
# Ensure the GitHub credentials were retrieved
if [ -z "$GITHUB_USERNAME" ]; then
echo "Please provide github-username."
exit 1
fi
if [ -z "$GITHUB_TOKEN" ]; then
echo "Please provide github-token."
exit 1
fi
# Ensure current user is root
if (( EUID != 0 )); then
echo "Please run as root!"
exit 1
fi
# Check if user albert exists
if grep -q "albert" /etc/passwd; then
echo "user albert exists"
else
useradd albert
fi
# Installing python & pip
echo "Installing python3, pip3"
#sudo yum update -y # For testing purpose so that I don't have to wait for a long time.
sudo yum install -y python3 python3-pip
sudo yum install git -y
# Open Port 8443 for sample Application
setenforce 0
firewall-cmd --query-port=8443/tcp
firewall-cmd --add-port=8443/tcp --permanent
firewall-cmd --reload
firewall-cmd --query-port=8443/tcp
setenforce 1
#firewall-cmd --query-port=8443/tcp
# Create log directory and make albert the owner
echo "Creating directory"
mkdir -p /var/log/pandhu
chown -R albert:albert /var/log/pandhu
chmod 744 /usr/local/lib/
mkdir /app
cd /app/
# Git clone app & kafka repos
git clone https://${GITHUB_USERNAME}:${GITHUB_TOKEN}@github.source.internal.xyz/DigitalT/sample.git
git clone https://${GITHUB_USERNAME}:${GITHUB_TOKEN}@github.source.internal.xyz/DigitalT/kafka.git
chown -R albert:albert /app/
# Retrieve sample application SSL public key & private key
echo "Retrieve app SSL credentials"
cd /app/sample/ssl/
ssl_cert=$(aws secretsmanager get-secret-value --secret-id app_test_private_key --query 'SecretString' --output text --region ap-southeast-2 | base64 -d)
ssl_key=$(aws secretsmanager get-secret-value --secret-id app_test_public_key --query 'SecretString' --output text --region ap-southeast-2)
echo "${ssl_cert}" > applb.test.confclient.com.crt
echo "${ssl_key}" > applbcerttest.key
# Allow the SSL credentials to be read by user albert
chmod 744 /app/sample/ssl/*
# copy SSL file to root directory
cp -apvf /app/sample/ssl/applb.test.confclient.com.crt /etc/pki/tls/certs/applb.test.confclient.com.crt
cp -apvf /app/sample/ssl/applbcerttest.key /etc/pki/tls/private/applbcerttest.key
# Run scripts in user albert environment
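# NOTE: everything from here to EOF runs as user albert; the quoted 'EOF' delimiter means nothing in the heredoc is expanded by the outer (root) shell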
su -l albert <<'EOF'
user=albert
if [ "$user" != "albert" ]; then
echo "user is not albert"
exit 1
fi
echo "user is albert"
# Install app packages in virtual environment
echo "Creating app environement"
cd /app/sample
python3 -m venv env
echo "Activating app environement ..."
source env/bin/activate
echo "Installing app packages from pypi ..."
pip install -r /app/sample/requirements.txt --no-cache-dir --index-url https://artifactory.internal.xyz/artifactory/api/pypi/org.python.pypi/simple
# Install Kafka packages in virtual environment
echo "Creating Kafka environement"
cd /app/kafka
python3 -m venv env
echo "Activating Kafka environement"
source env/bin/activate
echo "Installing kafka packages"
pip install -r /app/kafka/requirements.txt --no-cache-dir --index-url https://artifactory.internal.xyz/artifactory/api/pypi/org.python.pypi/simple
# Retrieve dhondhu credentials from secret manager
echo "Retrieve dhondhu credentials"
cd /app/kafka/dhondhu/etc/
ca_key=$(aws secretsmanager get-secret-value --secret-id ca-pem --query 'SecretString' --output text --region ap-southeast-2 | base64 -d)
cybercrime_key=$(aws secretsmanager get-secret-value --secret-id cybercrime-key --query 'SecretString' --output text --region ap-southeast-2)
aws secretsmanager get-secret-value --secret-id cybercrime-pem --query 'SecretString' --output text --region ap-southeast-2 | base64 -d > cybercrime.pem
dhondhutelemetry_key=$(aws secretsmanager get-secret-value --secret-id dhondhutelemetry-key --query 'SecretString' --output text --region ap-southeast-2 | base64 -d)
aws secretsmanager get-secret-value --secret-id dhondhutelemetry-pem --query 'SecretString' --output text --region ap-southeast-2 | base64 -d > dhondhutelemetry.pem
splunk_token=$(aws secretsmanager get-secret-value --secret-id splunk-api-token --query SecretString --output text --region ap-southeast-2)
# Inject splunk token value to relay.cf
sed -i 's/\!splunk-api-token!/'"${splunk_token}"'/' /app/kafka/dhondhu/relay.cf
# Save dhondhu SSL credentials to file
echo "${ca_key}" > ca.pem
echo "${cybercrime_key}" > cybercrime. Key
echo "${dhondhutelemetry_key}" > dhondhutelemetry.key
echo "Retrieve pandhu credentials from secret manager"
cd /app/kafka/pandhu/etc
trusted_pem=$(aws secretsmanager get-secret-value --secret-id pandhu-trusted-pem --query 'SecretString' --output text --region ap-southeast-2 | base64 -d)
aws secretsmanager get-secret-value --secret-id kafka-prod-cer --query 'SecretString' --output text --region ap-southeast-2 | base64 -d > kafka_prod.cer
aws secretsmanager get-secret-value --secret-id kafka-prod-key --query 'SecretString' --output text --region ap-southeast-2 | base64 -d > kafka_prod.key
# Save pandhu SSL credentials to file
echo "${trusted_pem}" > trusted.pem
# Run server.py
echo "Run app application"
cd /app/sample; source env/bin/activate; if ! (pgrep -a "python" | grep -v "server.py"); then python server.py 8443; fi
# Run kafka scripts
echo "Run kafka and splunk"
source /app/kafka/env/bin/activate; cd /app/kafka/dhondhu; if [ -z $(pgrep -a "python" | grep "relay") ]; then python relay.py; fi
source /app/kafka/env/bin/activate; cd /app/kafka/pandhu; if [ -z $(pgrep -a "python" | grep "ingest2") ]; then python run/ingest2.py; fi
# schedule cron job
echo "Scheduling cron job"
cat /app/sample/albert.crontab | crontab -
# List cron jobs
crontab -l
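# IMDSv2: fetch a session token, then use it to look up the instance ID and region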
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
INSTANCE_ID=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/instance-id)
REGION=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/placement/region)
STACK_NAME=$(aws cloudformation describe-stack-resources \
--physical-resource-id $INSTANCE_ID \
--query 'StackResources[0].StackName' \
--output text)
function signal_cfn() {
echo "I am in signal_cfn function"
STACK_STATUS=$(aws cloudformation describe-stacks \
--stack-name $STACK_NAME \
--query 'Stacks[0].StackStatus' \
--output text)
if [[ "$STACK_STATUS" == "CREATE_IN_PROGRESS" || "$STACK_STATUS" == "UPDATE_IN_PROGRESS" ]]; then
echo "Instance being created by CloudFormation, signalling $1."
echo "${1}"
aws cloudformation signal-resource \
--stack-name $STACK_NAME \
--region $REGION \
--logical-resource-id AutoScalingGroup \
--unique-id $INSTANCE_ID \
--status $1
else
echo "Not signalling to cfn as instance provisioning wasn't kicked off by CloudFormation."
fi
}
function wait_for_lb_health_check_to_pass() {
echo "I am in Load Balancer function"
local max_attempts=10
local wait_time_in_seconds=30
local attempt=0
_instance_health_status="unhealthy"
_target_group_arn_parameter_key="LBTargetGroupArn"
_target_group_arn=$(aws cloudformation describe-stacks \
--stack-name $STACK_NAME \
--query "Stacks[0].Parameters[?ParameterKey == '${_target_group_arn_parameter_key}'].ParameterValue" \
--output text)
echo "Target group ARN: ${_target_group_arn}"
while [[ $attempt -lt $max_attempts && $_instance_health_status != "healthy" ]]
do
sleep $wait_time_in_seconds
attempt=$(( attempt + 1))
wait_time_in_seconds=$(( wait_time_in_seconds * 2 ))
_instance_health_status=$(aws elbv2 describe-target-health \
--target-group-arn $_target_group_arn \
--targets Id=$INSTANCE_ID,Port=8443 \
--query "TargetHealthDescriptions[0].TargetHealth.State" \
--output text)
echo "Instance health status: ${_instance_health_status}"
if [[ $_instance_health_status == "healthy" ]]; then
echo "Instance ${INSTANCE_ID} is healthy now"
break
fi
done
if [[ $_instance_health_status != "healthy" ]]; then
signal_cfn FAILURE
exit 1;
fi
echo "load balancer function completed"
}
trap "signal_cfn FAILURE" ERR
APP_RUNNING=$(pgrep -u albert -f "server.py")
if [ -z "${APP_RUNNING}" ] ; then
signal_cfn FAILURE
else
signal_cfn SUCCESS
fi
echo "Script complete."
wait_for_lb_health_check_to_pass
echo "app ALB Health Checks have passed. Signal SUCCESS to cfn."
echo "[FINISH] UserData finished at $(date +"%d-%m-%Y %H:%M")"
EOF
# (I also tried placing the same block, everything from TOKEN=... down to echo "[FINISH]...", here after EOF so that it runs as root outside the heredoc; since it is identical to the code inside the heredoc it is not repeated here.)
Error Message:
The following resource(s) failed to create: [AutoScalingGroup]. Rollback requested by user.
Failed to receive 1 resource signal(s) for the current batch. Each resource signal timeout is counted as a FAILURE.
Received 0 SUCCESS signal(s) out of 1. Unable to satisfy 100% MinSuccessfulInstancesPercent requirement
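For reference, these are the sorts of checks that can be run over SSH on the instance to see how far user-data actually got (the log path comes from the exec/tee line at the top of the script; the journalctl variant assumes a systemd-based image):
tail -n 100 /var/log/user-data.log                      # how far did the script get?
sudo journalctl -t user-data --no-pager | tail -n 50    # same output via the syslog tag set by logger -t
pgrep -u albert -af python                              # are server.py / relay.py / ingest2.py running as albert?
sudo ss -ltnp | grep 8443                               # is anything listening on the port the target group checks?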
I did try moving the code from TOKEN=... through echo "[FINISH]..." to after EOF (noted in the comment after EOF above). What happens then is that if I SSH to the VM and execute the cfn signal part manually, the stack is created.
Unfortunately that is not a workable solution, because someone has to SSH in and run the last cfn signal step by hand every time.
I am quite confused about what is preventing the SUCCESS signal from being sent back to the CloudFormation API.
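For completeness, what I run by hand over SSH is essentially the same lookups and signal-resource call that the script already contains, and run that way the stack does complete successfully:
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
INSTANCE_ID=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id)
REGION=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/region)
STACK_NAME=$(aws cloudformation describe-stack-resources \
  --physical-resource-id "$INSTANCE_ID" \
  --query 'StackResources[0].StackName' \
  --output text)
aws cloudformation signal-resource \
  --stack-name "$STACK_NAME" \
  --region "$REGION" \
  --logical-resource-id AutoScalingGroup \
  --unique-id "$INSTANCE_ID" \
  --status SUCCESS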