From 6f128f7a9576f78f22e87d37c9fe0f26496b7a7c Mon Sep 17 00:00:00 2001
From: David Hoese <david.hoese@ssec.wisc.edu>
Date: Wed, 25 Nov 2020 14:34:55 -0600
Subject: [PATCH] Remove KubernetesPodNotHealthy alert because it has false positives

---
Reviewer notes (this section is not part of the commit message):
The KubernetesPodNotHealthy rule is disabled by commenting it out; its text
stays in the rules file as comments instead of being deleted.
NOTE(review): the leading whitespace of the YAML lines in the hunk below was
reconstructed from a whitespace-collapsed copy of this patch. Confirm it
against admin/kubekorner_geosphere_prometheus_rules.yaml before applying.

 .../kubekorner_geosphere_prometheus_rules.yaml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/admin/kubekorner_geosphere_prometheus_rules.yaml b/admin/kubekorner_geosphere_prometheus_rules.yaml
index 345d749..abafad6 100644
--- a/admin/kubekorner_geosphere_prometheus_rules.yaml
+++ b/admin/kubekorner_geosphere_prometheus_rules.yaml
@@ -434,15 +434,15 @@ spec:
         summary: "Kubernetes StatefulSet down (instance {{ $labels.instance }})"
         description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
 
-    - alert: KubernetesPodNotHealthy
-      expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:5m]) > 0
-      for: 5m
-      labels:
-        severity: critical
-        ruleGroup: geosphere-kubernetes
-      annotations:
-        summary: "Kubernetes Pod not healthy (instance {{ $labels.instance }})"
-        description: "Pod has been in a non-ready state for longer than an hour.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+#    - alert: KubernetesPodNotHealthy
+#      expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:5m]) > 0
+#      for: 5m
+#      labels:
+#        severity: critical
+#        ruleGroup: geosphere-kubernetes
+#      annotations:
+#        summary: "Kubernetes Pod not healthy (instance {{ $labels.instance }})"
+#        description: "Pod has been in a non-ready state for longer than an hour.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
     - alert: KubernetesPodCrashLooping
       expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 5
       for: 5m
-- 
GitLab