
Commit 7a70e0d

K8SPSMDB-1080 - Use trap to catch exit status
1 parent 31091a0 commit 7a70e0d

27 files changed: +14 −75 lines changed

e2e-tests/arbiter/run

Lines changed: 0 additions & 1 deletion
@@ -31,7 +31,6 @@ check_cr_config() {
     if [[ $(kubectl_bin get pod \
         --selector=statefulset.kubernetes.io/pod-name="${cluster}-arbiter-0" \
         -o jsonpath='{.items[*].status.containerStatuses[?(@.name == "mongod-arbiter")].restartCount}') -gt 0 ]]; then
-        collect_k8s_logs
         echo "Something went wrong with arbiter. Exiting..."
         exit 1
     fi

e2e-tests/balancer/run

Lines changed: 0 additions & 1 deletion
@@ -15,7 +15,6 @@ check_balancer() {
         | grep -E -v "Percona Server for MongoDB|connecting to:|Implicit session:|versions do not match|Error saving history file:|bye")
 
     if [[ $balancer_running != "$expected" ]]; then
-        collect_k8s_logs
         echo "Unexpected output from \"db.adminCommand({balancerStatus: 1}).mode\": $balancer_running"
         echo "Expected $expected"
         exit 1

e2e-tests/cross-site-sharded/run

Lines changed: 0 additions & 1 deletion
@@ -101,7 +101,6 @@ for i in "rs0" "rs1"; do
 done
 
 if [[ $shards -lt 2 ]]; then
-    collect_k8s_logs
     echo "data is only on some of the shards, maybe sharding is not working"
     exit 1
 fi

e2e-tests/data-at-rest-encryption/run

Lines changed: 0 additions & 2 deletions
@@ -83,7 +83,6 @@ encrypted_cluster_log=$(kubectl_bin logs some-name-rs0-0 -c mongod -n $namespace
 
 echo "$encrypted_cluster_log"
 if [ -z "$encrypted_cluster_log" ]; then
-    collect_k8s_logs
     echo "Cluster is not encrypted"
     exit 1
 fi

@@ -100,7 +99,6 @@ until [ "$retry" -ge 10 ]; do
         echo "Cluster is not encrypted already"
         break
     elif [ $retry == 15 ]; then
-        collect_k8s_logs
         echo "Max retry count $retry reached. Cluster is still encrypted"
         exit 1
     else

e2e-tests/data-sharded/run

Lines changed: 0 additions & 3 deletions
@@ -17,7 +17,6 @@ check_rs_proper_component_deletion() {
     until [[ $(kubectl_bin get sts -l app.kubernetes.io/instance=${cluster},app.kubernetes.io/replset=${rs_name} -ojson | jq '.items | length') -eq 0 ]]; do
         let retry+=1
         if [ $retry -ge 70 ]; then
-            collect_k8s_logs
             sts_count=$(kubectl_bin get sts -l app.kubernetes.io/instance=${cluster},app.kubernetes.io/replset=${rs_name} -ojson | jq '.items | length')
             echo "Replset $rs_name not properly removed, expected sts count of 0 but got $sts_count. Exiting after $retry tries..."
             exit 1

@@ -116,7 +115,6 @@ main() {
     done
 
     if [[ $shards -lt 3 ]]; then
-        collect_k8s_logs
         echo "data is only on some of the shards, maybe sharding is not working"
         exit 1
     fi

@@ -127,7 +125,6 @@ main() {
         "clusterAdmin:clusterAdmin123456@$cluster-mongos.$namespace" "mongodb" ".svc.cluster.local" \
         "--tlsCertificateKeyFile /tmp/tls.pem --tlsCAFile /etc/mongodb-ssl/ca.crt --tls")
     if ! echo $res | grep -q '"ok" : 1'; then
-        collect_k8s_logs
         echo "app database not dropped. Exiting.."
         exit 1
     fi

e2e-tests/default-cr/run

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@ function stop_cluster() {
         let passed_time="${passed_time}+${sleep_time}"
         sleep ${sleep_time}
         if [[ ${passed_time} -gt ${max_wait_time} ]]; then
-            collect_k8s_logs
             echo "We've been waiting for cluster stop for too long. Exiting..."
             exit 1
         fi

e2e-tests/demand-backup-physical-sharded/run

Lines changed: 0 additions & 3 deletions
@@ -38,7 +38,6 @@ run_recovery_check() {
     wait_restore "${backup_name}" "${cluster}" "ready" "0" "1800"
     kubectl_bin get psmdb ${cluster} -o yaml
     if [ $(kubectl_bin get psmdb ${cluster} -o yaml | yq '.metadata.annotations."percona.com/resync-pbm"') == null ]; then
-        collect_k8s_logs
         echo "psmdb/${cluster} should be annotated with percona.com/resync-pbm after a physical restore"
         exit 1
     fi

@@ -53,7 +52,6 @@ check_exported_mongos_service_endpoint() {
     local host=$1
 
     if [ "$host" != "$(kubectl_bin get psmdb $cluster -o=jsonpath='{.status.host}')" ]; then
-        collect_k8s_logs
         echo "Exported host is not correct after the restore"
         exit 1
     fi

@@ -82,7 +80,6 @@ wait_cluster_consistency ${cluster}
 lbEndpoint=$(kubectl_bin get svc $cluster-mongos -o=jsonpath='{.status}' |
     jq -r 'select(.loadBalancer != null and .loadBalancer.ingress != null and .loadBalancer.ingress != []) | .loadBalancer.ingress[0][]')
 if [ -z $lbEndpoint ]; then
-    collect_k8s_logs
     echo "mongos service not exported correctly"
     exit 1
 fi

e2e-tests/demand-backup-physical/run

Lines changed: 0 additions & 1 deletion
@@ -38,7 +38,6 @@ run_recovery_check() {
     wait_restore "${backup_name}" "${cluster}" "ready" "0" "1800"
     kubectl_bin get psmdb ${cluster} -o yaml
     if [ $(kubectl_bin get psmdb ${cluster} -o yaml | yq '.metadata.annotations."percona.com/resync-pbm"') == null ]; then
-        collect_k8s_logs
         echo "psmdb/${cluster} should be annotated with percona.com/resync-pbm after a physical restore"
         exit 1
     fi

e2e-tests/demand-backup-sharded/run

Lines changed: 0 additions & 1 deletion
@@ -166,7 +166,6 @@ backup_exists=$(kubectl_bin run -i --rm aws-cli --image=perconalab/awscli --rest
     /usr/bin/aws --endpoint-url http://minio-service:9000 s3 ls s3://operator-testing/ \
     | grep -c ${backup_dest_minio}_ | cat)
 if [[ $backup_exists -eq 1 ]]; then
-    collect_k8s_logs
     echo "Backup was not removed from bucket -- minio"
     exit 1
 fi

e2e-tests/demand-backup/run

Lines changed: 0 additions & 2 deletions
@@ -135,7 +135,6 @@ backup_exists=$(kubectl_bin run -i --rm aws-cli --image=perconalab/awscli --rest
     /usr/bin/aws --endpoint-url http://minio-service:9000 s3 ls s3://operator-testing/ \
     | grep -c ${backup_dest_minio} | cat)
 if [[ $backup_exists -eq 1 ]]; then
-    collect_k8s_logs
     echo "Backup was not removed from bucket -- minio"
     exit 1
 fi

@@ -171,7 +170,6 @@ backup_exists=$(kubectl_bin run -i --rm aws-cli --image=perconalab/awscli --rest
     /usr/bin/aws --endpoint-url http://minio-service:9000 s3 ls s3://operator-testing/ \
     | grep -c ${backup_dest_minio} | cat)
 if [[ $backup_exists -eq 1 ]]; then
-    collect_k8s_logs
     echo "Backup was not removed from bucket -- minio"
     exit 1
 fi

e2e-tests/expose-sharded/run

Lines changed: 0 additions & 2 deletions
@@ -23,7 +23,6 @@ function stop_cluster() {
         let passed_time="${passed_time}+${sleep_time}"
         sleep ${passed_time}
         if [[ ${passed_time} -gt ${max_wait_time} ]]; then
-            collect_k8s_logs
             echo "We've been waiting for cluster stop for too long. Exiting..."
             exit 1
         fi

@@ -53,7 +52,6 @@ function compare_mongo_config() {
     rs0_0_endpoint_actual=$(run_mongo 'var host;var x=0;rs.conf().members.forEach(function(d){ if(d.tags.podName=="some-name-rs0-0"){ host=rs.conf().members[x].host;print(host)};x=x+1; })' "clusterAdmin:clusterAdmin123456@${cluster}-rs0.${namespace}" | egrep -v 'I NETWORK|W NETWORK|Error saving history file|Percona Server for MongoDB|connecting to:|Unable to reach primary for set|Implicit session:|versions do not match|Error saving history file:|bye')
 
     if [[ $rs0_0_endpoint_actual != "$rs0_0_endpoint:27017" || $cfg_0_endpoint_actual != "$cfg_0_endpoint:27017" ]]; then
-        collect_k8s_logs
         desc "Actual values rs $rs0_0_endpoint_actual and cfg $cfg_0_endpoint_actual do not match expected rs $rs0_0_endpoint:27017 and cfg $cfg_0_endpoint:27017"
         exit 1
     fi

e2e-tests/functions

Lines changed: 9 additions & 19 deletions
@@ -28,6 +28,15 @@ conf_dir=$(realpath $test_dir/../conf || :)
 src_dir=$(realpath $test_dir/../..)
 logs_dir=$(realpath $test_dir/../logs)
 
+trap cleanup EXIT HUP INT QUIT TERM
+cleanup() {
+    exit_code=$?
+    if [[ ${exit_code} -ne 0 ]]; then
+        collect_k8s_logs
+    fi
+    exit ${exit_code}
+}
+
 if [[ ${ENABLE_LOGGING} == "true" ]]; then
     if [ ! -d "${logs_dir}" ]; then
         mkdir "${logs_dir}"

@@ -150,7 +159,6 @@ wait_pod() {
         echo -n .
         let retry+=1
         if [ $retry -ge 360 ]; then
-            collect_k8s_logs
             kubectl_bin describe pod/$pod
             kubectl_bin logs $pod
             kubectl_bin logs ${OPERATOR_NS:+-n $OPERATOR_NS} $(get_operator_pod) \

@@ -179,7 +187,6 @@ wait_cron() {
         echo -n .
         let retry+=1
         if [ $retry -ge 360 ]; then
-            collect_k8s_logs
             kubectl_bin logs ${OPERATOR_NS:+-n $OPERATOR_NS} $(get_operator_pod) \
                 | grep -v 'level=info' \
                 | grep -v 'level=debug' \

@@ -205,7 +212,6 @@ wait_backup_agent() {
         echo -n .
         let retry+=1
         if [ $retry -ge 360 ]; then
-            collect_k8s_logs
             kubectl_bin logs $agent_pod -c backup-agent \
                 | tail -100
 

@@ -230,7 +236,6 @@ wait_backup() {
         let retry+=1
         current_status=$(kubectl_bin get psmdb-backup $backup_name -o jsonpath='{.status.state}')
         if [[ $retry -ge 360 || ${current_status} == 'error' ]]; then
-            collect_k8s_logs
             kubectl_bin logs ${OPERATOR_NS:+-n $OPERATOR_NS} $(get_operator_pod) \
                 | grep -v 'level=info' \
                 | grep -v 'level=debug' \

@@ -291,7 +296,6 @@ wait_deployment() {
         echo -n .
         let retry+=1
         if [ $retry -ge 360 ]; then
-            collect_k8s_logs
             kubectl_bin logs ${OPERATOR_NS:+-n $OPERATOR_NS} $(get_operator_pod) \
                 | grep -v 'level=info' \
                 | grep -v 'level=debug' \

@@ -339,7 +343,6 @@ wait_restore() {
         let retry+=1
         current_state=$(kubectl_bin get psmdb-restore restore-$backup_name -o jsonpath='{.status.state}')
         if [[ $retry -ge $wait_time || ${current_state} == 'error' ]]; then
-            collect_k8s_logs
             kubectl_bin logs ${OPERATOR_NS:+-n $OPERATOR_NS} $(get_operator_pod) \
                 | grep -v 'level=info' \
                 | grep -v 'level=debug' \

@@ -553,7 +556,6 @@ retry() {
 
     until "$@"; do
         if [[ $n -ge $max ]]; then
-            collect_k8s_logs
             echo "The command '$@' has failed after $n attempts."
             exit 1
         fi

@@ -593,7 +595,6 @@ wait_for_running() {
         timeout=$((timeout + 1))
         echo -n '.'
         if [[ ${timeout} -gt 1500 ]]; then
-            collect_k8s_logs
             echo
             echo "Waiting timeout has been reached. Exiting..."
             exit 1

@@ -616,7 +617,6 @@ wait_for_delete() {
         echo -n .
         let retry+=1
         if [ $retry -ge $wait_time ]; then
-            collect_k8s_logs
             kubectl logs ${OPERATOR_NS:+-n $OPERATOR_NS} $(get_operator_pod) \
                 | grep -v 'level=info' \
                 | grep -v 'level=debug' \

@@ -639,8 +639,6 @@ compare_generation() {
 
     current_generation="$(kubectl_bin get ${resource_type} "${resource_name}" -o jsonpath='{.metadata.generation}')"
     if [[ ${generation} != "${current_generation}" ]]; then
-        collect_k8s_logs
-
         echo "Generation for ${resource_type}/${resource_name} is: ${current_generation}, but should be: ${generation}"
         exit 1
     fi

@@ -1011,7 +1009,6 @@ get_service_endpoint() {
         return
     fi
 
-    collect_k8s_logs
     exit 1
 }
 

@@ -1150,9 +1147,6 @@ kubectl_bin() {
     cat "$LAST_OUT"
     cat "$LAST_ERR" >&2
     rm "$LAST_OUT" "$LAST_ERR"
-    if [ ${exit_status} != 0 ]; then
-        collect_k8s_logs
-    fi
     return ${exit_status}
 }
 

@@ -1191,7 +1185,6 @@ wait_cluster_consistency() {
     until [[ "$(kubectl_bin get psmdb "${cluster_name}" -o jsonpath='{.status.state}')" == "ready" ]]; do
         let retry+=1
         if [ $retry -ge $wait_time ]; then
-            collect_k8s_logs
             echo max retry count $retry reached. something went wrong with operator or kubernetes cluster
             exit 1
         fi

@@ -1218,7 +1211,6 @@ check_backup_deletion() {
     retry=0
     until [[ $(curl -sw '%{http_code}' -o /dev/null $path) -eq 403 ]] || [[ $(curl -sw '%{http_code}' -o /dev/null $path) -eq 404 ]]; do
         if [ $retry -ge 10 ]; then
-            collect_k8s_logs
             echo max retry count $retry reached. something went wrong with operator or kubernetes cluster
             echo "Backup was not removed from bucket -- $storage_name"
             exit 1

@@ -1280,7 +1272,6 @@ function get_mongod_ver_from_image() {
     version_info=$(run_simple_cli_inside_image ${image} 'mongod --version' | $sed -r 's/^.*db version v(([0-9]+\.){2}[0-9]+-[0-9]+).*$/\1/g')
 
     if [[ ! ${version_info} =~ ^([0-9]+\.){2}[0-9]+-[0-9]+$ ]]; then
-        collect_k8s_logs
         printf "No mongod version obtained from %s. Exiting" ${image}
         exit 1
     fi

@@ -1293,7 +1284,6 @@ function get_pbm_version() {
     local version_info=$(run_simple_cli_inside_image ${image} 'pbm-agent version' | $sed -r 's/^Version:\ (([0-9]+\.){2}[0-9]+)\ .*/\1/g')
 
     if [[ ! ${version_info} =~ ^([0-9]+\.){2}[0-9]+$ ]]; then
-        collect_k8s_logs
         printf "No pbm version obtained from %s. Exiting" ${image}
         exit 1
     fi
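
The heart of the change is the first hunk in e2e-tests/functions: rather than sprinkling collect_k8s_logs over every failure branch, the test library now registers a single cleanup handler that runs on EXIT (and on HUP, INT, QUIT, and TERM) and collects Kubernetes logs only when the script is exiting with a nonzero status. Below is a minimal, self-contained sketch of that pattern; the trap and cleanup lines mirror the committed code, while the collect_k8s_logs stub is hypothetical, added only so the sketch runs standalone.

    #!/bin/bash
    set -o errexit

    # Hypothetical stub for this sketch; the real collect_k8s_logs in
    # e2e-tests/functions gathers logs from the cluster under test.
    collect_k8s_logs() {
        echo "collecting k8s logs..."
    }

    cleanup() {
        exit_code=$? # exit status of the command that triggered the trap
        if [[ ${exit_code} -ne 0 ]]; then
            collect_k8s_logs # log collection now lives in one place
        fi
        exit ${exit_code} # preserve the original exit status
    }
    trap cleanup EXIT HUP INT QUIT TERM

    echo "test body runs here"
    false # with errexit, any failing command ends the script and fires the trap

A clean run exits 0 and skips collection entirely; a failure anywhere in the script reaches cleanup with its original status intact. That is what lets this commit delete the scattered per-call-site invocations shown above while still covering every failure path, including ones that never had an explicit collect_k8s_logs call.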

e2e-tests/init-deploy/run

Lines changed: 0 additions & 1 deletion
@@ -61,7 +61,6 @@ compare_mongo_cmd "find" "myApp:myPass@$cluster-2.$cluster.$namespace"
 desc 'check number of connections'
 conn_count=$(run_mongo 'db.serverStatus().connections.current' "clusterAdmin:clusterAdmin123456@$cluster.$namespace" | egrep -v 'I NETWORK|W NETWORK|Error saving history file|Percona Server for MongoDB|connecting to:|Unable to reach primary for set|Implicit session:|versions do not match|bye')
 if [ ${conn_count} -gt ${max_conn} ]; then
-    collect_k8s_logs
     echo "Mongo connection count ${conn_count} is greater than maximum connection count limit: ${max_conn}"
     exit 1
 fi

e2e-tests/mongod-major-upgrade-sharded/run

Lines changed: 0 additions & 1 deletion
@@ -94,7 +94,6 @@ function main() {
         | grep -E '^\{.*\}$' | jq -r '.featureCompatibilityVersion.version')
 
     if [[ ${currentFCV} != ${version} ]]; then
-        collect_k8s_logs
         echo "FCV at the moment is ${currentFCV} and is not set to ${version} as it should. Exiting..."
         exit 1
     fi

e2e-tests/mongod-major-upgrade/run

Lines changed: 0 additions & 1 deletion
@@ -89,7 +89,6 @@ function main() {
         | grep -E '^\{.*\}$' | jq -r '.featureCompatibilityVersion.version')
 
     if [[ ${currentFCV} != ${version} ]]; then
-        collect_k8s_logs
         echo "FCV at the moment is ${currentFCV} and is not set to ${version} as it should. Exiting..."
         exit 1
     fi

e2e-tests/monitoring-2-0/run

Lines changed: 0 additions & 2 deletions
@@ -37,7 +37,6 @@ until kubectl_bin exec monitoring-0 -- bash -c "ls -l /proc/*/exe 2>/dev/null| g
     sleep 5
     let retry+=1
     if [ $retry -ge 20 ]; then
-        collect_k8s_logs
         echo "Max retry count $retry reached. Pmm-server can't start"
         exit 1
     fi

@@ -151,7 +150,6 @@ if [[ -n ${OPENSHIFT} ]]; then
 fi
 
 if [[ $(kubectl_bin logs monitoring-rs0-0 pmm-client | grep -c 'cannot auto discover databases and collections') != 0 ]]; then
-    collect_k8s_logs
     echo "error: cannot auto discover databases and collections"
     exit 1
 fi

e2e-tests/multi-cluster-service/run

Lines changed: 0 additions & 3 deletions
@@ -23,7 +23,6 @@ wait_mcs_api() {
     until [[ $(kubectl_bin api-resources | grep ServiceExport | wc -l) -eq 1 ]]; do
         let retry+=1
         if [ $retry -ge 64 ]; then
-            collect_k8s_logs
             echo max retry count $retry reached. Something went wrong with MCS, probably a problem on GCP side.
             exit 1
         fi

@@ -41,7 +40,6 @@ wait_service_import() {
     until [[ "$(kubectl_bin get serviceimport --ignore-not-found | grep -v 'NAME' | wc -l)" -eq "9" ]]; do
         let retry+=1
         if [ $retry -ge 64 ]; then
-            collect_k8s_logs
             echo max retry count $retry reached. Something went wrong with MCS, probably a problem in gke-mcs-importer.
             exit 1
         fi

@@ -60,7 +58,6 @@ wait_service_export() {
     until [[ "$(kubectl_bin get serviceexport --ignore-not-found | grep -v 'NAME' | wc -l)" -eq "9" ]]; do
         let retry+=1
         if [ $retry -ge 64 ]; then
-            collect_k8s_logs
             echo max retry count $retry reached. Something went wrong with MCS, probably a problem in gke-mcs-exporter.
             exit 1
         fi

e2e-tests/one-pod/compare/statefulset_one-pod-rs0.yml

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ metadata:
   name: one-pod
 spec:
   podManagementPolicy: OrderedReady
-  replicas: 1
+  replicas: 2
   revisionHistoryLimit: 10
   selector:
     matchLabels:

e2e-tests/rs-shard-migration/run

Lines changed: 0 additions & 4 deletions
@@ -38,12 +38,10 @@ function main() {
     wait_cluster_consistency "${cluster}"
 
     if [[ $(kubectl_bin get statefulset/${cluster}-mongos -o jsonpath='{.status.readyReplicas}') -lt 1 ]]; then
-        collect_k8s_logs
         echo "Mongos hasn't been properly started. Exiting..."
         exit 1
     fi
     if [[ "$(kubectl_bin get sts/${cluster}-cfg -o jsonpath='{.status.replicas}')" != "$(kubectl_bin get sts/${cluster}-cfg -o jsonpath='{.status.readyReplicas}')" ]]; then
-        collect_k8s_logs
         echo "Cfg pods haven't been properly started. Exiting..."
         exit 1
     fi

@@ -56,7 +54,6 @@ function main() {
 
     if [[ -z "$(get_shard_parameter ${cluster} ${namespace} lastCommitedOpTime)" ]] \
         && [[ -z "$(get_shard_parameter ${cluster} ${namespace} '$configServerState.opTime.ts')" ]]; then # for mongo 3.6
-        collect_k8s_logs
         echo "Sharded cluster does not work properly"
         exit 1
     fi

@@ -73,7 +70,6 @@ function main() {
         || [[ -n "$(kubectl_bin get service -o jsonpath='{.items[?(@.metadata.name == "'"${cluster}-mongos"'")].metadata.name}')" ]] \
         || [[ -n "$(kubectl_bin get service -o jsonpath='{.items[?(@.metadata.name == "'"${cluster}-cfg"'")].metadata.name}')" ]] \
         || [[ -n "$(kubectl_bin get statefulset -o jsonpath='{.items[?(@.metadata.name == "'"${cluster}-cfg"'")].metadata.name}')" ]]; then
-        collect_k8s_logs
         echo "Transition to replicaset cluster has not been done well. Cluster does not work properly or some leftovers still exist"
         exit 1
     fi
