From e27997acedbbfc849eec61bd3285e5fc99ae0466 Mon Sep 17 00:00:00 2001 From: David Hoese <david.hoese@ssec.wisc.edu> Date: Tue, 6 Oct 2020 08:16:03 -0500 Subject: [PATCH] Update prometheus rules to not trigger so often on kubekorner --- ...kubekorner_geosphere_prometheus_rules.yaml | 22 +++++++++---------- admin/prometheus_kubernetes_values.yaml | 13 ++++++----- ci_geosphere/values-mapcache.yaml | 2 +- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/admin/kubekorner_geosphere_prometheus_rules.yaml b/admin/kubekorner_geosphere_prometheus_rules.yaml index ce4c97b..54c1781 100644 --- a/admin/kubekorner_geosphere_prometheus_rules.yaml +++ b/admin/kubekorner_geosphere_prometheus_rules.yaml @@ -211,7 +211,7 @@ spec: summary: "Host unusual network throughput out (instance {{ $labels.instance }})" description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - alert: HostUnusualDiskReadRate - expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 + expr: sum by (instance) (irate(node_disk_read_bytes_total[5m])) / 1024 / 1024 > 50 for: 5m labels: severity: warning @@ -220,7 +220,7 @@ spec: summary: "Host unusual disk read rate (instance {{ $labels.instance }})" description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - alert: HostUnusualDiskWriteRate - expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50 + expr: sum by (instance) (irate(node_disk_written_bytes_total[5m])) / 1024 / 1024 > 50 for: 5m labels: severity: warning @@ -461,15 +461,15 @@ spec: annotations: summary: "Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})" description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - alert: KubernetesCronjobTooLong - expr: time() - kube_cronjob_next_schedule_time > 3600 - for: 5m - labels: - severity: warning - ruleGroup: geosphere-kubernetes - annotations: - summary: "Kubernetes CronJob too long (instance {{ $labels.instance }})" - description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" +# - alert: KubernetesCronjobTooLong +# expr: time() - kube_cronjob_next_schedule_time > 3600 +# for: 5m +# labels: +# severity: warning +# ruleGroup: geosphere-kubernetes +# annotations: +# summary: "Kubernetes CronJob too long (instance {{ $labels.instance }})" +# description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - alert: KubernetesJobCompletion expr: kube_job_spec_completions - kube_job_status_succeeded > 0 or kube_job_status_failed > 0 diff --git a/admin/prometheus_kubernetes_values.yaml b/admin/prometheus_kubernetes_values.yaml index 913d5fd..9b338a3 100644 --- a/admin/prometheus_kubernetes_values.yaml +++ b/admin/prometheus_kubernetes_values.yaml @@ -5,16 +5,18 @@ kubeControllerManager: - "128.104.110.154" kubeEtcd: - endpoints: - - "128.104.110.154" + enabled: false +# endpoints: +# - "128.104.110.154" kubeScheduler: endpoints: - "128.104.110.154" kubeProxy: - endpoints: - - "128.104.110.154" + enabled: false +# endpoints: +# - "128.104.110.154" alertmanager: ## Alertmanager configuration directives @@ -43,9 +45,8 @@ alertmanager: slack_configs: - channel: "#geo2grid" send_resolved: true - icon_emoji: '{{ if eq .Status "firing" }}:fearful:{{ else }}:excellent:{{ end }}' color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' - title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}' + title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}' text: |- {{ range .Alerts }} *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` diff --git a/ci_geosphere/values-mapcache.yaml b/ci_geosphere/values-mapcache.yaml index 70a00e2..fd4b1cc 100644 --- a/ci_geosphere/values-mapcache.yaml +++ b/ci_geosphere/values-mapcache.yaml @@ -13,7 +13,7 @@ cache: cleanup: # every 6 hours schedule: "0 */6 * * *" - age: "+7" + age: "+2" seed: images: true overlays: true -- GitLab