Skip to content
Snippets Groups Projects
Verified Commit e27997ac authored by David Hoese
Browse files

Update prometheus rules to not trigger so often on kubekorner

parent 38bf3f59
No related branches found
Tags r20201006_1316
No related merge requests found
...@@ -211,7 +211,7 @@ spec: ...@@ -211,7 +211,7 @@ spec:
summary: "Host unusual network throughput out (instance {{ $labels.instance }})" summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualDiskReadRate - alert: HostUnusualDiskReadRate
expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 expr: sum by (instance) (irate(node_disk_read_bytes_total[5m])) / 1024 / 1024 > 50
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
...@@ -220,7 +220,7 @@ spec: ...@@ -220,7 +220,7 @@ spec:
summary: "Host unusual disk read rate (instance {{ $labels.instance }})" summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualDiskWriteRate - alert: HostUnusualDiskWriteRate
expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50 expr: sum by (instance) (irate(node_disk_written_bytes_total[5m])) / 1024 / 1024 > 50
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
...@@ -461,15 +461,15 @@ spec: ...@@ -461,15 +461,15 @@ spec:
annotations: annotations:
summary: "Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})" summary: "Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})"
description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: KubernetesCronjobTooLong # - alert: KubernetesCronjobTooLong
expr: time() - kube_cronjob_next_schedule_time > 3600 # expr: time() - kube_cronjob_next_schedule_time > 3600
for: 5m # for: 5m
labels: # labels:
severity: warning # severity: warning
ruleGroup: geosphere-kubernetes # ruleGroup: geosphere-kubernetes
annotations: # annotations:
summary: "Kubernetes CronJob too long (instance {{ $labels.instance }})" # summary: "Kubernetes CronJob too long (instance {{ $labels.instance }})"
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" # description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: KubernetesJobCompletion - alert: KubernetesJobCompletion
expr: kube_job_spec_completions - kube_job_status_succeeded > 0 or kube_job_status_failed > 0 expr: kube_job_spec_completions - kube_job_status_succeeded > 0 or kube_job_status_failed > 0
......
...@@ -5,16 +5,18 @@ kubeControllerManager: ...@@ -5,16 +5,18 @@ kubeControllerManager:
- "128.104.110.154" - "128.104.110.154"
kubeEtcd: kubeEtcd:
endpoints: enabled: false
- "128.104.110.154" # endpoints:
# - "128.104.110.154"
kubeScheduler: kubeScheduler:
endpoints: endpoints:
- "128.104.110.154" - "128.104.110.154"
kubeProxy: kubeProxy:
endpoints: enabled: false
- "128.104.110.154" # endpoints:
# - "128.104.110.154"
alertmanager: alertmanager:
## Alertmanager configuration directives ## Alertmanager configuration directives
...@@ -43,9 +45,8 @@ alertmanager: ...@@ -43,9 +45,8 @@ alertmanager:
slack_configs: slack_configs:
- channel: "#geo2grid" - channel: "#geo2grid"
send_resolved: true send_resolved: true
icon_emoji: '{{ if eq .Status "firing" }}:fearful:{{ else }}:excellent:{{ end }}'
color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}'
title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}' title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}'
text: |- text: |-
{{ range .Alerts }} {{ range .Alerts }}
*Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
......
...@@ -13,7 +13,7 @@ cache: ...@@ -13,7 +13,7 @@ cache:
cleanup: cleanup:
# every 6 hours # every 6 hours
schedule: "0 */6 * * *" schedule: "0 */6 * * *"
age: "+7" age: "+2"
seed: seed:
images: true images: true
overlays: true overlays: true
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment