about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- rules/alerts.yml 5
1 files changed, 4 insertions, 1 deletions
diff --git a/rules/alerts.yml b/rules/alerts.yml
index 0a34552..42c4217 100644
--- a/rules/alerts.yml
+++ b/rules/alerts.yml
@@ -13,7 +13,8 @@ groups:
labels:
severity: info
annotations:
- description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+ description: "CPU load is > 85%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+ clickthroughQuery: "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\",{{ range $key, $value := $labels }}{{ $key }}=\"{{ $value }}\",{{ end }} }[10m])) * 100)"
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[5m]) > 0
for: 5m
@@ -21,6 +22,7 @@ groups:
severity: info
annotations:
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+ clickthroughQuery: "increase(node_vmstat_oom_kill{ {{ range $key, $value := $labels }}{{ $key }}=\"{{ $value }}\",{{ end }} }[5m])"
- alert: HostRaidArrayGotInactive
expr: node_md_state{state="inactive"} > 0
for: 5m
@@ -28,6 +30,7 @@ groups:
severity: critical
annotations:
description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+ clickthroughQuery: "node_md_state{state=\"inactive\",}"
- alert: HostRaidDiskFailure
expr: node_md_disks{state="fail"} > 0
for: 5m