diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index 9ebcef558..d54c52cb0 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -12,6 +12,10 @@ import ( "github.com/argoproj/gitops-engine/pkg/utils/kube" ) +const ( + AnnotationIgnoreRestartPolicy = "argocd.argoproj.io/ignore-restart-policy" +) + func getPodHealth(obj *unstructured.Unstructured) (*HealthStatus, error) { gvk := obj.GroupVersionKind() switch gvk { @@ -93,9 +97,9 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { } return &HealthStatus{Status: HealthStatusDegraded, Message: ""}, nil + case corev1.PodRunning: - switch pod.Spec.RestartPolicy { - case corev1.RestartPolicyAlways: + getHealthStatus := func(pod *corev1.Pod) (*HealthStatus, error) { // if pod is ready, it is automatically healthy if podutils.IsPodReady(pod) { return &HealthStatus{ @@ -117,10 +121,18 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { Status: HealthStatusProgressing, Message: pod.Status.Message, }, nil - case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever: - // pods set with a restart policy of OnFailure or Never, have a finite life. + } + policy := pod.Spec.RestartPolicy + if _, ok := pod.Annotations[AnnotationIgnoreRestartPolicy]; ok || policy == corev1.RestartPolicyAlways { + return getHealthStatus(pod) + } + + if policy == corev1.RestartPolicyOnFailure || policy == corev1.RestartPolicyNever { + // Most pods set with a restart policy of OnFailure or Never, have a finite life. // These pods are typically resource hooks. Thus, we consider these as Progressing - // instead of healthy. + // instead of healthy. If this is unwanted, e.g., when the pod is managed by an + // operator and therefore has a restart policy of OnFailure or Never, then use the + // AnnotationIgnoreRestartPolicy annotation.
return &HealthStatus{ Status: HealthStatusProgressing, Message: pod.Status.Message, diff --git a/pkg/health/health_test.go b/pkg/health/health_test.go index ef945eb46..20ddc5f5c 100644 --- a/pkg/health/health_test.go +++ b/pkg/health/health_test.go @@ -103,6 +103,7 @@ func TestPod(t *testing.T) { assertAppHealth(t, "./testdata/pod-error.yaml", HealthStatusDegraded) assertAppHealth(t, "./testdata/pod-running-restart-always.yaml", HealthStatusHealthy) assertAppHealth(t, "./testdata/pod-running-restart-never.yaml", HealthStatusProgressing) + assertAppHealth(t, "./testdata/pod-running-restart-never-with-ignore-annotation.yaml", HealthStatusHealthy) assertAppHealth(t, "./testdata/pod-running-restart-onfailure.yaml", HealthStatusProgressing) assertAppHealth(t, "./testdata/pod-failed.yaml", HealthStatusDegraded) assertAppHealth(t, "./testdata/pod-succeeded.yaml", HealthStatusHealthy) diff --git a/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml b/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml new file mode 100644 index 000000000..1cc27e5f1 --- /dev/null +++ b/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml @@ -0,0 +1,87 @@ +apiVersion: v1 +kind: Pod +metadata: + creationTimestamp: 2018-12-02T09:15:16Z + name: my-pod + namespace: argocd + resourceVersion: "151053" + selfLink: /api/v1/namespaces/argocd/pods/my-pod + uid: c86e909c-f612-11e8-a057-fe5f49266390 + annotations: + argocd.argoproj.io/ignore-restart-policy: "true" +spec: + containers: + - command: + - sh + - -c + - sleep 10 + image: alpine:3.21 + imagePullPolicy: Always + name: main + resources: + requests: + ephemeral-storage: "100Mi" + memory: "128Mi" + cpu: "250m" + limits: + memory: "256Mi" + cpu: "500m" + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: default-token-f9jvj + readOnly: true + dnsPolicy: ClusterFirst + 
nodeName: minikube + restartPolicy: Never + schedulerName: default-scheduler + securityContext: {} + serviceAccount: default + serviceAccountName: default + automountServiceAccountToken: false + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - name: default-token-f9jvj + secret: + defaultMode: 420 + secretName: default-token-f9jvj +status: + conditions: + - lastProbeTime: null + lastTransitionTime: 2018-12-02T09:15:16Z + status: "True" + type: Initialized + - lastProbeTime: null + lastTransitionTime: 2018-12-02T09:15:19Z + status: "True" + type: Ready + - lastProbeTime: null + lastTransitionTime: 2018-12-02T09:15:16Z + status: "True" + type: PodScheduled + containerStatuses: + - containerID: containerd://adc73c2c0ae3f1fd9bf294abd834e740042ee375de680c0cfcdd90d863a73b8b + image: alpine:3.21 + imageID: docker.io/library/alpine@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c + lastState: {} + name: main + ready: true + restartCount: 0 + state: + running: + startedAt: 2018-12-02T09:15:19Z + hostIP: 192.168.64.41 + phase: Running + podIP: 172.17.0.9 + qosClass: BestEffort + startTime: 2018-12-02T09:15:16Z