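# NOTE: the following three lines are navigation residue from the snippet web
# page this file was copied from; kept as comments so the document parses.
# Skip to content
# Snippets Groups Projects
# kubekorner_geosphere_prometheus_rules.yaml 26.00 KiB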
# PrometheusRule resource (monitoring.coreos.com/v1) holding the geosphere
# alerting rule groups listed under spec.groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: geosphere-prometheus-rules
  # NOTE(review): presumably these labels are what the Prometheus instance's
  # rule selector matches on - confirm against the Prometheus resource.
  labels:
    release: prometheus-operator
    app: kube-prometheus-stack
  creationTimestamp: null
spec:
  groups:
#  - name: ./example.rules
#    rules:
#    - alert: ExampleAlert
#      expr: vector(1)
#      labels:
#        severity: warning
#      annotations:
#        summary: "Example Alert"
#        description: "A test prometheus rule that always fires"

  # Most of the rules below are taken from
  # https://awesome-prometheus-alerts.grep.to/rules.html
  # Alerts on HTTP error ratios and request latency, computed from the
  # nginx_http_requests_total and nginx_http_request_duration_seconds_bucket
  # metrics.
  - name: geosphere-nginx-ingress.rules
    rules:
      # Fires when 4xx responses exceed 5% of all requests (1m rate) and the
      # condition has held for 5 minutes.
      # NOTE(review): both sum() calls aggregate away every label, so the
      # instance label referenced in the summary renders empty. Add
      # `by (instance)` to both sides if a per-instance alert is intended.
      - alert: NginxHighHttp4xxErrorRate
        expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
        for: 5m
        labels:
          severity: critical
          ruleGroup: geosphere-nginx-ingress-controller
        annotations:
          summary: "Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})"
          description: "Too many HTTP requests with status 4xx (> 5%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Fires when 5xx responses exceed 5% of all requests (1m rate) and the
      # condition has held for 5 minutes.
      # NOTE(review): same label caveat as the 4xx rule - the aggregated
      # result carries no instance label.
      - alert: NginxHighHttp5xxErrorRate
        expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
        for: 5m
        labels:
          severity: critical
          ruleGroup: geosphere-nginx-ingress-controller
        annotations:
          summary: "Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})"
          description: "Too many HTTP requests with status 5xx (> 5%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Fires when the p99 request duration per (host, node), computed over a
      # 30m rate window, stays above 10 seconds for 5 minutes.
      # `le` must be kept in the aggregation: histogram_quantile() needs the
      # bucket upper bounds, and without them the expression never yields a
      # value that can exceed the threshold, so the alert could never fire.
      # The summary names host/node because those are the only identifying
      # labels left after the aggregation.
      - alert: NginxLatencyHigh
        expr: histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[30m])) by (host, node, le)) > 10
        for: 5m
        labels:
          severity: warning
          ruleGroup: geosphere-nginx-ingress-controller
        annotations:
          summary: "Nginx latency high (host {{ $labels.host }}, node {{ $labels.node }})"
          description: "Nginx p99 latency is higher than 10 seconds\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"


  - name: geosphere-prometheus-checks.rules
    rules:
    # Warns when a process whose job label is prometheus, pushgateway or
    # alertmanager changed its start time more than twice within 15 minutes,
    # once that condition has held for 5 minutes.
    - alert: PrometheusTooManyRestarts
      expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
      for: 5m
      annotations:
        # Block scalar with stripped trailing newline; the two-space indent
        # on the VALUE/LABELS lines is part of the rendered text.
        description: |-
          Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
            VALUE = {{ $value }}
            LABELS: {{ $labels }}
        summary: 'Prometheus too many restarts (instance {{ $labels.instance }})'
      labels:
        ruleGroup: geosphere-prometheus
        severity: warning
    - alert: PrometheusAlertmanagerConfigurationReloadFailure
      expr: alertmanager_config_last_reload_successful != 1
      for: 5m
      labels:
        severity: warning
        ruleGroup: geosphere-prometheus