# alertrules.nodes.yml 2.23 KiB
# Prometheus alerting rules for node-level health: CPU, memory, load average,
# scrape availability and root filesystem usage.
# Annotation templates are wrapped in {% raw %}...{% endraw %}; presumably this
# keeps a Liquid/Jinja-style templating step from expanding the Go template
# braces before Prometheus reads the file -- confirm against the deploy tooling.
groups:
- name: alertrules.nodes
  rules:
  # Fires when the average non-idle CPU share of a node stays above 90% for 1m.
  # The per-CPU idle rate is joined with node_uname_info so the result carries
  # `nodename`, and both `instance` and `nodename` are kept in the aggregation:
  # $labels only contains labels that survive the expression, so aggregating by
  # `instance` alone left `nodename` and `host` empty in the annotations.
  - alert: high_cpu_usage_on_node
    expr: |-
      100 - (
        avg by (instance, nodename) (
          irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])
          * on(instance) group_left(nodename)
          node_uname_info{job="vm-node-exporter"}
        ) * 100
      ) > 90
    for: 1m
    annotations:
      description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.instance }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.'
      summary: HIGH CPU USAGE WARNING ON '{% raw %}{{ $labels.nodename }}{% endraw %}'
  # Fires when used memory, (MemTotal - MemAvailable) / MemTotal, stays above
  # 95% for 1m. node_uname_info is joined in to attach `nodename`.
  # `instance` is kept in the aggregation because the description prints it;
  # the previous `host` reference was always empty, since `sum by(nodename)`
  # discards every other label.
  - alert: high_memory_usage_on_node
    expr: |-
      sum by (nodename, instance) (
        ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
        * on(instance) group_left(nodename) node_uname_info
        * 100
      ) > 95
    for: 1m
    annotations:
      description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.instance }}{% endraw %}) is using a LOT of MEMORY. MEMORY usage is over {% raw %}{{ humanize $value}}{% endraw %}%.'
      summary: HIGH MEMORY USAGE WARNING ON '{% raw %}{{ $labels.nodename }}{% endraw %}'
  # Fires when the 1-minute load average per CPU core stays above 200% for 1m.
  # The core count is derived by counting the per-CPU `mode="system"` series.
  # Uses node_cpu_seconds_total, matching the metric naming used by the other
  # rules in this group; the older `node_cpu` name is not exposed alongside
  # the `_seconds_total` / `_bytes` metrics, so the rule could never match.
  # `instance` is kept in the aggregation because the description prints it
  # (the previous `host` label did not survive `sum by(nodename)`).
  - alert: high_load_on_node
    expr: |-
      sum by (nodename, instance) (
        (node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"}))
        * on(instance) group_left(nodename) node_uname_info
        * 100
      ) > 200
    for: 1m
    annotations:
      description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.instance }}{% endraw %}) has a high load average. Load average is {% raw %}{{ $value }}{% endraw %}%.'
      summary: HIGH LOAD AVERAGE WARNING ON '{% raw %}{{ $labels.nodename }}{% endraw %}'
  # Fires when a scrape target has reported `up == 0` for 1m.
  # NOTE(review): the expression has no label matcher, so it matches every
  # scraped job, not only node exporters -- confirm that is intended.
  # NOTE(review): the summary assumes targets carry a `host` label -- confirm
  # against the scrape configuration.
  - alert: node_exporter_down
    expr: 'up == 0'
    for: 1m
    annotations:
      summary: "NODE EXPORTER SERVICE CRITICAL: NODE '{% raw %}{{ $labels.host }}{% endraw %}'"
      description: "The node exporter '{% raw %}{{ $labels.job }}{% endraw %}' is down."
  # Fires when more than 80% of the root filesystem ("/") is used for 1m.
  # Usage is (size - free) * 100 / size, joined with node_uname_info to attach
  # `nodename` for the summary.
  # Uses the `_bytes` filesystem metric names, matching the metric naming used
  # by the other rules in this group; the older unsuffixed names are not
  # exposed alongside them, so the rule could never match.
  - alert: node_running_out_of_disk_space
    expr: |-
      sum by (nodename) (
        (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})
        * 100 / node_filesystem_size_bytes{mountpoint="/"}
        * on(instance) group_left(nodename) node_uname_info
      ) > 80
    for: 1m
    annotations:
      description: 'More than 80% of disk used. Disk usage is {% raw %}{{ humanize $value }}{% endraw %}%'
      summary: 'LOW DISK SPACE WARNING: NODE ''{% raw %}{{ $labels.nodename }}{% endraw %}'''