diff --git a/admin/kubekorner_geosphere_prometheus_rules.yaml b/admin/kubekorner_geosphere_prometheus_rules.yaml index ce4c97bb7d373d6fd1d9f1df6f5e02fd4519d3f7..54c17819d3c9428e905051a646216c881c31c9d7 100644 --- a/admin/kubekorner_geosphere_prometheus_rules.yaml +++ b/admin/kubekorner_geosphere_prometheus_rules.yaml @@ -211,7 +211,7 @@ spec: summary: "Host unusual network throughput out (instance {{ $labels.instance }})" description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - alert: HostUnusualDiskReadRate - expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 + expr: sum by (instance) (irate(node_disk_read_bytes_total[5m])) / 1024 / 1024 > 50 for: 5m labels: severity: warning @@ -220,7 +220,7 @@ spec: summary: "Host unusual disk read rate (instance {{ $labels.instance }})" description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - alert: HostUnusualDiskWriteRate - expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50 + expr: sum by (instance) (irate(node_disk_written_bytes_total[5m])) / 1024 / 1024 > 50 for: 5m labels: severity: warning @@ -461,15 +461,15 @@ spec: annotations: summary: "Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})" description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - alert: KubernetesCronjobTooLong - expr: time() - kube_cronjob_next_schedule_time > 3600 - for: 5m - labels: - severity: warning - ruleGroup: geosphere-kubernetes - annotations: - summary: "Kubernetes CronJob too long (instance {{ $labels.instance }})" - description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" +# - alert: KubernetesCronjobTooLong +# expr: time() - kube_cronjob_next_schedule_time > 3600 +# for: 5m +# labels: +# severity: warning +# ruleGroup: geosphere-kubernetes +# annotations: +# summary: "Kubernetes CronJob too long (instance {{ $labels.instance }})" +# description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - alert: KubernetesJobCompletion expr: kube_job_spec_completions - kube_job_status_succeeded > 0 or kube_job_status_failed > 0 diff --git a/admin/prometheus_kubernetes_values.yaml b/admin/prometheus_kubernetes_values.yaml index 913d5fd42bf9142a94c13d10acdf0ffa27a5f833..9b338a3f017e8d3713477f33ecb7c855a3ba8223 100644 --- a/admin/prometheus_kubernetes_values.yaml +++ b/admin/prometheus_kubernetes_values.yaml @@ -5,16 +5,18 @@ kubeControllerManager: - "128.104.110.154" kubeEtcd: - endpoints: - - "128.104.110.154" + enabled: false +# endpoints: +# - "128.104.110.154" kubeScheduler: endpoints: - "128.104.110.154" kubeProxy: - endpoints: - - "128.104.110.154" + enabled: false +# endpoints: +# - "128.104.110.154" alertmanager: ## Alertmanager configuration directives @@ -43,9 +45,8 @@ alertmanager: slack_configs: - channel: "#geo2grid" send_resolved: true - icon_emoji: '{{ if eq .Status "firing" }}:fearful:{{ else }}:excellent:{{ end }}' color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' - title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}' + title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}' text: |- {{ range .Alerts }} *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` diff --git a/ci_geosphere/values-mapcache.yaml b/ci_geosphere/values-mapcache.yaml index 70a00e26b0dad74aa55e783d56406c89b3e244b7..fd4b1cc8ae98b300141f8493665147924925081a 100644 --- a/ci_geosphere/values-mapcache.yaml +++ b/ci_geosphere/values-mapcache.yaml @@ -13,7 +13,7 @@ cache: cleanup: # every 6 hours schedule: "0 */6 * * *" - age: "+7" + age: "+2" seed: images: true overlays: true