aboutsummaryrefslogtreecommitdiff
path: root/rules/alerts.yml
blob: a4728550ced3083273faf8962c1b64db9b29b755 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# several alerts stolen from https://awesome-prometheus-alerts.grep.to/
# and here: https://gitlab.com/gitlab-com/runbooks/-/tree/master/rules
#
# PLEASE properly label and annotate your alerts. When in doubt refer
# to the rest of them!

groups:
- name: node.rules
  rules:
  # Fires when the non-idle CPU share, averaged per instance, stays above 85%
  # for five minutes. The description now quotes the same threshold as the
  # expression (it previously said 80%).
  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[10m])) * 100) > 85
    for: 5m
    labels:
      severity: info
    annotations:
      description: |-
        CPU load is > 85%
          VALUE = {{ $value }}
          LABELS: {{ $labels }}
  # Fires as soon as the OOM-kill counter has grown within the last 5 minutes.
  # `for` is 0m on purpose: after a single kill the increase over a 5m window
  # is only non-zero for about that window, so a 5m pending period would
  # expire before the alert could ever fire.
  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[5m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      description: |-
        OOM kill detected
          VALUE = {{ $value }}
          LABELS: {{ $labels }}
  # Critical alert: an md array has reported state "inactive" for 5 minutes.
  - alert: HostRaidArrayGotInactive
    expr: >-
      node_md_state{state="inactive"} > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      # Block scalar keeps the message's own line breaks and the two-space
      # indent of the VALUE/LABELS lines.
      description: |-
        RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
          VALUE = {{ $value }}
          LABELS: {{ $labels }}
  # Critical alert: an md array has had at least one failed member disk for
  # 5 minutes.
  # NOTE(review): node_exporter's mdadm collector reports the state as
  # "failed" and names the array label "device" (the same label that
  # HostRaidArrayGotInactive uses); the previous selector state="fail" would
  # not match. Confirm against the exporter version in use.
  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      description: |-
        At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap
          VALUE = {{ $value }}
          LABELS: {{ $labels }}
  # Critical alert: smartmon_device_smart_healthy has been anything other
  # than 1 for 5 minutes.
  - alert: SmartUnhealthy
    expr: >-
      smartmon_device_smart_healthy != 1
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "Disk {{ $labels.disk }} on host {{ $labels.instance }} is unhealthy according to SMART"
  # Critical alert: less than 10% of total memory has been available for
  # 5 minutes, counting node_zfs_arc_c_max as available on hosts that export
  # it.
  #
  # In PromQL `+` binds tighter than `or`, so the earlier form
  # `node_zfs_arc_c_max or 0 + MemAvailable` evaluated to the ZFS value alone
  # on ZFS hosts and never added MemAvailable to it. The explicit grouping
  # below adds the two, and uses `MemAvailable * 0` as a per-instance zero
  # fallback for hosts without the ZFS metric (`or` needs a vector on both
  # sides, so a bare scalar 0 cannot be used there).
  # NOTE(review): assumes the ZFS and memory series carry the same label set
  # (instance/job) so they match one-to-one; confirm.
  - alert: HostOutOfMemory
    expr: >-
      ((node_zfs_arc_c_max or node_memory_MemAvailable_bytes * 0)
      + node_memory_MemAvailable_bytes)
      / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "Host {{ $labels.instance }} is running out of memory"
  # Critical alert: a linear extrapolation of the last hour of free bytes
  # reaches below zero within 4 hours (4 * 3600 seconds), and has done so for
  # 5 minutes. tmpfs filesystems are excluded by the selector.
  - alert: DiskWillFillIn4Hours
    expr: >-
      predict_linear(node_filesystem_free_bytes{job="node",fstype!~"tmpfs"}[1h],
      4 * 3600) < 0
    for: 5m
    labels:
      severity: critical
    annotations:
      description: 'Host {{ $labels.instance }} disk {{ $labels.device }} will fill within 4 hours'
  # Critical alert: a process has had more than 80% of its file-descriptor
  # limit open for 10 minutes.
  - alert: ProcessNearFDLimits
    expr: >-
      process_open_fds / process_max_fds > 0.8
    for: 10m
    labels:
      severity: critical
    annotations:
      description: "FDs for process {{ $labels.job }} on {{ $labels.instance}} is filling up"
  # Informational alert: the summed receive rate per instance, divided by
  # 1024 twice, has exceeded 100 for 5 minutes.
  - alert: HostUnusualNetworkThroughputIn
    expr: >-
      sum by (instance) (irate(node_network_receive_bytes_total[2m]))
      / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: info
    annotations:
      description: "Unusual inbound throughput on {{ $labels.instance }}"
  # Informational alert: the summed transmit rate per instance, divided by
  # 1024 twice, has exceeded 100 for 5 minutes.
  - alert: HostUnusualNetworkThroughputOut
    expr: >-
      sum by (instance) (irate(node_network_transmit_bytes_total[2m]))
      / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: info
    annotations:
      description: "Unusual outbound throughput on {{ $labels.instance }}"
  # Critical alert: more than 48 hours (3600 * 48 seconds) separate the
  # current time from postgresql_backup_last_run_seconds, sustained for
  # 5 minutes.
  - alert: NoRecentPostgresBackup
    expr: >-
      time() - postgresql_backup_last_run_seconds > 3600 * 48
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "Postgres backup script has not run on {{ $labels.instance }} for more than two days"
  # Critical alert: more than 48 hours (3600 * 48 seconds) separate the
  # current time from postgresql_offsite_backup_last_run_seconds, sustained
  # for 5 minutes.
  # The description names the offsite backup explicitly so this alert can be
  # told apart from NoRecentPostgresBackup, whose text it used to duplicate.
  - alert: NoRecentPostgresOffsiteBackup
    expr: time() - postgresql_offsite_backup_last_run_seconds > 3600 * 48
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "Postgres offsite backup script has not run on {{ $labels.instance }} for more than two days"
- name: pg.rules
  rules:
  # Critical alert: pg_up has been 0 for 5 minutes.
  - alert: PostgresqlDown
    expr: >-
      pg_up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "PostgreSQL is down on {{ $labels.instance }}"
  # Informational alert: the process behind job "postgres" started less than
  # 60 seconds ago.
  # `for` is 0m on purpose: the condition can only hold during the first
  # 60 seconds after start, so a 1m pending period could never complete and
  # the alert would never fire.
  # NOTE(review): process_start_time_seconds describes whichever process
  # exposes the metrics for job "postgres"; confirm that this is the database
  # server and not an exporter sidecar.
  - alert: PostgresqlRestarted
    expr: time() - process_start_time_seconds{job="postgres"} < 60
    for: 0m
    labels:
      severity: info
    annotations:
      description: "PostgreSQL was restarted on {{ $labels.instance }}"
- name: prometheus.rules
  rules:
  # Informational alert: prometheus_config_last_reload_successful has been
  # anything other than 1 for 5 minutes.
  - alert: PrometheusConfigurationReloadFailure
    expr: >-
      prometheus_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: info
    annotations:
      description: "Prometheus failed to reload. Fix the config!"
- name: ssl_expiry.rules
  rules:
  # Informational alert: the earliest probed certificate expiry is less than
  # 7 days (86400 * 7 seconds) away, sustained for 10 minutes.
  - alert: SSLCertExpiringSoon
    expr: >-
      probe_ssl_earliest_cert_expiry - time() < 86400 * 7
    for: 10m
    labels:
      severity: info
    annotations:
      description: "The {{ $labels.instance }} certificate expires soon. Renew it pls!"