cloudnative-pg · fpfuetsch · Apr 11, 2025 · Apr 18, 2025
@@ -17,7 +17,7 @@ annotations:
     case you may want to silence it.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md
 expr: |
-  max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ .namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ .namespace }}"}) < 1
+  max by (job) (cnpg_pg_replication_streaming_replicas{job="{{ .job }}"} - cnpg_pg_replication_is_wal_receiver_up{job="{{ .job }}"}) < 1
 for: 5m
 labels:
   severity: critical

@@ -15,7 +15,7 @@ annotations:
     In this case you may want to silence it.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md
 expr: |
-  max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ .namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ .namespace }}"}) < 2
+  max by (job) (cnpg_pg_replication_streaming_replicas{job="{{ .job }}"} - cnpg_pg_replication_is_wal_receiver_up{job="{{ .job }}"}) < 2
 for: 5m
 labels:
   severity: warning

@@ -4,11 +4,11 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance maximum number of connections critical!
   description: |-
-    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
+    CloudNativePG Cluster "{{ .job }}" instance {{ .labels.pod }} is using {{ .value }}% of
     the maximum number of connections.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
 expr: |
-  sum by (pod) (cnpg_backends_total{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 95
+  sum by (pod) (cnpg_backends_total{job="{{ .job }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", job="{{ .job }}", pod=~"{{ .podSelector }}"}) * 100 > 95
 for: 5m
 labels:
   severity: critical

@@ -4,11 +4,11 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is approaching the maximum number of connections.
   description: |-
-    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
+    CloudNativePG Cluster "{{ .job }}" instance {{ .labels.pod }} is using {{ .value }}% of
     the maximum number of connections.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
 expr: |
-  sum by (pod) (cnpg_backends_total{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 80
+  sum by (pod) (cnpg_backends_total{job="{{ .job }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", job="{{ .job }}", pod=~"{{ .podSelector }}"}) * 100 > 80
 for: 5m
 labels:
   severity: warning

@@ -4,13 +4,13 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster high replication lag
   description: |-
-    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is experiencing a high replication lag of
+    CloudNativePG Cluster "{{ .job }}" is experiencing a high replication lag of
     {{ .value }}ms.
 
     High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md
 expr: |
-  max(cnpg_pg_replication_lag{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) * 1000 > 1000
+  max(cnpg_pg_replication_lag{job="{{ .job }}",pod=~"{{ .podSelector }}"}) * 1000 > 1000
 for: 5m
 labels:
   severity: warning

@@ -4,13 +4,13 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster instances are located on the same node.
   description: |-
-    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has {{ .value }}
+    CloudNativePG Cluster "{{ .job }}" has {{ .value }}
     instances on the same node {{ .labels.node }}.
 
     A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md
 expr: |
   count by (node) (kube_pod_info{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) > 1
 for: 5m
 labels:
   severity: warning

@@ -4,14 +4,14 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is running out of disk space!
   description: |-
-    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running extremely low on disk space. Check attached PVCs!
+    CloudNativePG Cluster "{{ .job }}" is running extremely low on disk space. Check attached PVCs!
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
 expr: |
   max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.9 OR
   max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"})) > 0.9 OR
   max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"})
       /
       sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"})
       *
       on(namespace, persistentvolumeclaim) group_left(volume)
       kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ .podSelector }}"}

@@ -4,14 +4,14 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is running out of disk space.
   description: |-
-    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running low on disk space. Check attached PVCs.
+    CloudNativePG Cluster "{{ .job }}" is running low on disk space. Check attached PVCs.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
 expr: |
   max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.7 OR
   max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"})) > 0.7 OR
   max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"})
       /
       sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"})
       *
       on(namespace, persistentvolumeclaim) group_left(volume)
       kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ .podSelector }}"}

@@ -4,13 +4,13 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster has no running instances!
   description: |-
-    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has no ready instances.
+    CloudNativePG Cluster "{{ .job }}" has no ready instances.
 
     Having an offline cluster means your applications will not be able to access the database, leading to
     potential service disruption and/or data loss.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md
 expr: |
-  (count(cnpg_collector_up{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR on() vector(0)) == 0
+  (count(cnpg_collector_up{job="{{ .job }}",pod=~"{{ .podSelector }}"}) OR on() vector(0)) == 0
 for: 5m
 labels:
   severity: critical

@@ -4,12 +4,12 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster instances in the same zone.
   description: |-
-    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has instances in the same availability zone.
+    CloudNativePG Cluster "{{ .job }}" has instances in the same availability zone.
 
     A disaster in one availability zone will lead to a potential service disruption and/or data loss.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
 expr: |
   {{ .Values.cluster.instances }} > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3
 for: 5m
 labels:
   severity: warning

@@ -17,6 +17,7 @@ spec:
         {{- $_ := set $dict "value"       "{{ $value }}" -}}
         {{- $_ := set $dict "namespace"   .Release.Namespace -}}
         {{- $_ := set $dict "cluster"     (include "cluster.fullname" .) -}}
+        {{- $_ := set $dict "job"         (printf "%s/%s" .Release.Namespace (include "cluster.fullname" .)) -}}
         {{- $_ := set $dict "labels"      (dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}") -}}
         {{- $_ := set $dict "podSelector" (printf "%s-([1-9][0-9]*)$" (include "cluster.fullname" .)) -}}
         {{- $_ := set $dict "Values"      .Values -}}