Something went wrong on our end
-
David Hoese authored
kubekorner_geosphere_prometheus_rules.yaml 26.00 KiB
# PrometheusRule custom resource that carries the geosphere alerting rule groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
# NOTE(review): a null creationTimestamp looks like residue from an exported
# manifest — presumably safe to drop; confirm.
creationTimestamp: null
# NOTE(review): these labels presumably have to match the rule selector of the
# Prometheus instance that should load this object — confirm.
labels:
app: kube-prometheus-stack
release: prometheus-operator
name: geosphere-prometheus-rules
spec:
# Each list entry under groups is one named rule group.
groups:
# - name: ./example.rules
# rules:
# - alert: ExampleAlert
# expr: vector(1)
# labels:
# severity: warning
# annotations:
# summary: "Example Alert"
# description: "A test prometheus rule that always fires"
# Most of the below rules taken from
# https://awesome-prometheus-alerts.grep.to/rules.html
# Alerts on HTTP error ratios and tail latency derived from the nginx_http_*
# metrics. Every rule must hold for 5 minutes before it fires.
- name: geosphere-nginx-ingress.rules
  rules:
    # Share of 4xx responses among all requests (1m rates), in percent; fires
    # above 5%. Both sums aggregate away every label, so the result carries no
    # `instance` label and the summary does not reference one.
    - alert: NginxHighHttp4xxErrorRate
      expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
      for: 5m
      labels:
        severity: critical
        ruleGroup: geosphere-nginx-ingress-controller
      annotations:
        summary: "Nginx high HTTP 4xx error rate"
        description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
    # Same ratio as above for 5xx responses; the result is likewise label-free.
    - alert: NginxHighHttp5xxErrorRate
      expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
      for: 5m
      labels:
        severity: critical
        ruleGroup: geosphere-nginx-ingress-controller
      annotations:
        summary: "Nginx high HTTP 5xx error rate"
        description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
    # p99 request duration per (host, node), estimated from 30m bucket rates;
    # fires above 10 seconds. `le` has to survive the aggregation because
    # histogram_quantile() needs the bucket boundaries to interpolate a
    # quantile; without it the expression never yields a usable value.
    # The summary uses `host`, one of the labels the aggregation keeps.
    - alert: NginxLatencyHigh
      expr: histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[30m])) by (host, node, le)) > 10
      for: 5m
      labels:
        severity: warning
        ruleGroup: geosphere-nginx-ingress-controller
      annotations:
        summary: "Nginx latency high (host {{ $labels.host }})"
        description: "Nginx p99 latency is higher than 10 seconds\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: geosphere-prometheus-checks.rules
rules:
# Restart detector for the monitoring stack: the expression is true when
# process_start_time_seconds changed more than twice within a 15m window, and
# the alert fires once that has held for 5 minutes.
# NOTE(review): PromQL regex matchers are fully anchored, so only jobs named
# exactly prometheus, pushgateway or alertmanager match — confirm against the
# actual scrape job labels.
- alert: PrometheusTooManyRestarts
  for: 5m
  expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
  annotations:
    # Literal block scalar; the single leading space on the continuation lines
    # is part of the rendered text.
    description: |-
      Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
       VALUE = {{ $value }}
       LABELS: {{ $labels }}
    summary: 'Prometheus too many restarts (instance {{ $labels.instance }})'
  labels:
    ruleGroup: geosphere-prometheus
    severity: warning
- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: alertmanager_config_last_reload_successful != 1
for: 5m
labels:
severity: warning
ruleGroup: geosphere-prometheus