diff --git a/ansible/roles/stack-monitor-stateful/templates/alertrules.nodes.yml b/ansible/roles/stack-monitor-stateful/templates/alertrules.nodes.yml index 1b4d3bdc41a914b10197b1307a2c18a23e5cc785..0a03893ba22796938d0f82b8b02592d691b09c60 100644 --- a/ansible/roles/stack-monitor-stateful/templates/alertrules.nodes.yml +++ b/ansible/roles/stack-monitor-stateful/templates/alertrules.nodes.yml @@ -7,24 +7,24 @@ groups: labels: severity: WARNING annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' - summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' + description: '{% raw %}{{ $labels.instance }}{% endraw %} is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' + summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.instance }}{% endraw %}' - alert: high_cpu_usage_on_node_CRITICAL expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) >= {{ node_cpu_usage_percentage_threshold_Critical }} and (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) < {{ node_cpu_usage_percentage_threshold_Fatal }} for: 1m labels: severity: CRITICAL annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' - summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' + description: '{% raw %}{{ $labels.instance }}{% endraw %} is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' + summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.instance }}{% endraw %}' - alert: high_cpu_usage_on_node_FATAL expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) >= {{ node_cpu_usage_percentage_threshold_Fatal }} for: 1m labels: severity: FATAL annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' - summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' + description: '{% raw %}{{ $labels.instance }}{% endraw %} is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' + summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.instance }}{% endraw %}' - alert: high_memory_usage_on_node_WARNING expr: sum by(nodename) ((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) >= {{ node_memory_usage_percentage_threshold_Warning }} and (((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) < {{ node_memory_usage_percentage_threshold_Critical }} ) for: 1m diff --git a/ansible/roles/stack-monitor/templates/alertrules.nodes.yml b/ansible/roles/stack-monitor/templates/alertrules.nodes.yml index f7dca91ee8d5a35c8fc98108c4918b69e3479209..4cf0069874933b8f85863fb4876597bfaf7010e6 100644 --- a/ansible/roles/stack-monitor/templates/alertrules.nodes.yml +++ b/ansible/roles/stack-monitor/templates/alertrules.nodes.yml @@ -7,24 +7,24 @@ groups: labels: severity: WARNING annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' - summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' + description: '{% raw %}{{ $labels.instance }}{% endraw %} is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' + summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.instance }}{% endraw %}' - alert: high_cpu_usage_on_node_CRITICAL expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) >= {{ node_cpu_usage_percentage_threshold_Critical }} and (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) < {{ node_cpu_usage_percentage_threshold_Fatal }} for: 1m labels: severity: CRITICAL annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' - summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' + description: '{% raw %}{{ $labels.instance }}{% endraw %} is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' + summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.instance }}{% endraw %}' - alert: high_cpu_usage_on_node_FATAL expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) >= {{ node_cpu_usage_percentage_threshold_Fatal }} for: 1m labels: severity: FATAL annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' - summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' + description: '{% raw %}{{ $labels.instance }}{% endraw %} is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' + summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.instance }}{% endraw %}' - alert: high_memory_usage_on_node_WARNING expr: sum by(nodename) ((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) >= {{ node_memory_usage_percentage_threshold_Warning }} and (((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) < {{ node_memory_usage_percentage_threshold_Critical }} ) for: 1m