diff --git a/ansible/roles/stack-monitor-stateful/defaults/main.yml b/ansible/roles/stack-monitor-stateful/defaults/main.yml index 9e897fd4a4c987bbb3839344e5cbb6fc38f30b58..c3c09f80e44331a176400bf2c2270dda0a70b2f1 100644 --- a/ansible/roles/stack-monitor-stateful/defaults/main.yml +++ b/ansible/roles/stack-monitor-stateful/defaults/main.yml @@ -169,36 +169,3 @@ root_owner: root backup_storage_name: prometheus_backup prometheus_stateful_mount_point: "/root/dockerdata/prometheus_stateful/data/" docker_service_replicas_memory_limit: 512MB - -#################################################### Monitoring limits ################################################ -container_cpu_usage_percentage_theshold_Warning: 70 -container_cpu_usage_percentage_theshold_Critial: 85 -container_cpu_usage_percentage_theshold_Fatal: 95 - -container_memory_usage_percentage_theshold_Warning: 70 -container_memory_usage_percentage_theshold_Critical: 85 -container_memory_usage_percentage_theshold_Fatal: 95 - -node_cpu_usage_percentage_theshold_Warning: 70 -node_cpu_usage_percentage_theshold_Critial: 85 -node_cpu_usage_percentage_theshold_Fatal: 95 - -node_memory_usage_percentage_theshold_Warning: 70 -node_memory_usage_percentage_theshold_Critical: 85 -node_memory_usage_percentage_theshold_Fatal: 95 - -node_load_avg_theshold_Warning: 85 -node_load_avg_theshold_Critial: 95 -node_load_avg__theshold_Fatal: 120 - -node_disk_usage_percentage_theshold_Warning: 70 -node_disk_usage_percentage_theshold_Critial: 85 -node_disk_usage_percentage_theshold_Fatal: 95 - -postgres_number_of_connections_Warning: 100 -postgres_number_of_connections_Critical: 110 -postgres_number_of_connections_Fatal: 130 - -elasticsearch_filesystem_data_remaining_theshold_Warning: 30 -elasticsearch_filesystem_data_remaining_theshold_Critical: 20 -elasticsearch_filesystem_data_remaining_theshold_Fatal: 10 diff --git a/ansible/roles/stack-monitor-stateful/templates/alertrules.backups.yml b/ansible/roles/stack-monitor-stateful/templates/alertrules.backups.yml index ee3260de2e93a79dd71a0142994b89ee85ad1184..3f302fb6cfa272a21b671d7f5aeb95c600494bf7 100644 --- a/ansible/roles/stack-monitor-stateful/templates/alertrules.backups.yml +++ b/ansible/roles/stack-monitor-stateful/templates/alertrules.backups.yml @@ -1,19 +1,15 @@ groups: - name: alertrules.backups rules: - - alert: backup_is_too_old_CRITICAL + - alert: backup_is_too_old expr: time() - azure_blob_latest_file_timestamp{job="data-backup-azure-blob-exporter"} / 1000 > {{ expected_data_backup_interval_in_minutes|int * 60 }} for: 5m - labels: - severity: CRITICAL annotations: description: '{% raw %}{{ $labels.container }}{% endraw %}: Latest backup file was created {% raw %}{{ humanizeDuration $value }}{% endraw %} ago. 
Threshold: {{ expected_data_backup_interval_in_minutes }} minutes' summary: Backup is too old - - alert: backup_size_is_too_small_CRITICAL + - alert: backup_size_is_too_small expr: azure_blob_latest_file_size{job="data-backup-azure-blob-exporter"} < {{ expected_data_backup_size_in_bytes }} for: 5m - labels: - severity: CRITICAL annotations: description: '{% raw %}{{ $labels.container }}{% endraw %}: Latest backup file is {% raw %}{{ $value }}{% endraw %} bytes, smaller than the threshold {{ expected_data_backup_size_in_bytes }} bytes' summary: Backup size is too small diff --git a/ansible/roles/stack-monitor-stateful/templates/alertrules.docker.yml b/ansible/roles/stack-monitor-stateful/templates/alertrules.docker.yml index e222e7c07d6b3fc50d3559a303fb07a2bd16f46d..f3ff6b89ffad2f2958ed64b3804c89e3dd1b2717 100644 --- a/ansible/roles/stack-monitor-stateful/templates/alertrules.docker.yml +++ b/ansible/roles/stack-monitor-stateful/templates/alertrules.docker.yml @@ -1,11 +1,11 @@ groups: - name: alertrules.docker rules: - - alert: docker_swarm_node_down_FATAL + - alert: docker_swarm_node_down expr: swarm_manager_nodes{state="down"} > 0 for: 1m labels: - severity: FATAL + severity: critical annotations: description: 'Number nodes down : {% raw %}{{$value}}{% endraw %}' summary: 'Docker swarm node down' diff --git a/ansible/roles/stack-monitor-stateful/templates/alertrules.es.yml b/ansible/roles/stack-monitor-stateful/templates/alertrules.es.yml index 6da75bf0e0bebb83f6e2ed20fa3c1d522ed389d8..05ced8a09c5ef7458488cb492b12cf2e7758f484 100644 --- a/ansible/roles/stack-monitor-stateful/templates/alertrules.es.yml +++ b/ansible/roles/stack-monitor-stateful/templates/alertrules.es.yml @@ -3,35 +3,13 @@ groups: rules: - record: elasticsearch_filesystem_data_used_percent expr: 100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes - - alert: elasticsearch_filesystem_data_free_percent_WARNING - expr: 100 - elasticsearch_filesystem_data_used_percent >= {{ elasticsearch_filesystem_data_remaining_theshold_Warning }} - for: 1m - labels: - severity: WARNING - annotations: - description: Elasticsearch Free space on Disk {% raw %}{{$value}}{% endraw %} - summary: Elasticsearch has less free disk space {% raw %}{{$value}}{% endraw %} - - alert: elasticsearch_filesystem_data_free_percent_CRITICAL - expr: 100 - elasticsearch_filesystem_data_used_percent >= {{ elasticsearch_filesystem_data_remaining_theshold_Critical }} - for: 1m - labels: - severity: CRITICAL - annotations: - description: Elasticsearch Free space on Disk {% raw %}{{$value}}{% endraw %} - summary: Elasticsearch has less free disk space {% raw %}{{$value}}{% endraw %} - - alert: elasticsearch_filesystem_data_free_percent_FATAL - expr: 100 - elasticsearch_filesystem_data_used_percent >= {{ elasticsearch_filesystem_data_remaining_theshold_Fatal }} - for: 1m - labels: - severity: FATAL - annotations: - description: Elasticsearch Free space on Disk {% raw %}{{$value}}{% endraw %} - summary: Elasticsearch has less free disk space {% raw %}{{$value}}{% endraw %} + - record: elasticsearch_filesystem_data_free_percent + expr: 100 - elasticsearch_filesystem_data_used_percent - alert: elasticsearch_too_few_nodes_running expr: elasticsearch_cluster_health_number_of_nodes{job="elasticsearch-exporter"} < {{ groups['es'] | length }} for: 1m labels: - severity: CRITICAL + severity: critical annotations: description: There are only {% raw %}{{$value}}{% endraw %} < {{ groups['es'] | 
length }} ElasticSearch nodes running summary: ElasticSearch running on less than {{ groups['es'] | length }} nodes @@ -40,7 +18,7 @@ groups: expr: elasticsearch_cluster_health_number_of_nodes{job="log-elasticsearch-exporter"} < {{ groups['log-es'] | length }} for: 1m labels: - severity: CRITICAL + severity: critical annotations: description: There are only {% raw %}{{$value}}{% endraw %} < {{ groups['log-es'] | length }} ElasticSearch nodes running summary: ElasticSearch running on less than {{ groups['log-es'] | length }} nodes @@ -49,7 +27,7 @@ groups: expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9 for: 1m labels: - severity: CRITICAL + severity: critical annotations: description: The heap usage is over 90% for 15m summary: ElasticSearch node {% raw %}{{$labels.node}}{% endraw %} heap usage is high @@ -57,7 +35,7 @@ groups: expr: time() - elasticsearch_snapshots_latest_successful_snapshot_timestamp{job="elasticsearch-snapshots-exporter"} / 1000 > {{ expected_elasticsearch_snapshot_interval_in_minutes|int * 60 }} for: 1m labels: - severity: CRITICAL + severity: critical annotations: description: Elasticsearch snapshot is too old summary: Latest elasticSearch snapshot was taken {% raw %}{{ humanizeDuration $value }}{% endraw %} ago. Threshold is {{ expected_elasticsearch_snapshot_interval_in_minutes }} minutes diff --git a/ansible/roles/stack-monitor-stateful/templates/alertrules.logs.yml b/ansible/roles/stack-monitor-stateful/templates/alertrules.logs.yml index 367c881ee3f5c0f9e9f9f155536e801e24cfe006..c577015a57bffdd0ca04507a3b8e77092274a53a 100644 --- a/ansible/roles/stack-monitor-stateful/templates/alertrules.logs.yml +++ b/ansible/roles/stack-monitor-stateful/templates/alertrules.logs.yml @@ -1,11 +1,11 @@ groups: - name: alertrules.logs rules: - - alert: logs_ingestion_slow_CRITICAL + - alert: logs_ingestion_slow expr: increase(elasticsearch_indices_docs{job="log-elasticsearch-exporter"}[5m]) / 5 < {{ expected_minimum_logs_per_minute }} for: 1m labels: - severity: CRITICAL + severity: critical annotations: description: '{% raw %}{{ $labels.job }}{% endraw %}: Logs per minute is {% raw %}{{ $value }}{% endraw %}. It is below the threshold: {{ expected_minimum_logs_per_minute }}' summary: Logs are not flowing as expected diff --git a/ansible/roles/stack-monitor-stateful/templates/alertrules.nodes.yml b/ansible/roles/stack-monitor-stateful/templates/alertrules.nodes.yml index 40179e7983b0928e1378a39835c69876bdd4a4a1..bc6db6774fa5efe8a864ae56b812dacdb867cbeb 100644 --- a/ansible/roles/stack-monitor-stateful/templates/alertrules.nodes.yml +++ b/ansible/roles/stack-monitor-stateful/templates/alertrules.nodes.yml @@ -1,107 +1,35 @@ groups: - name: alertrules.nodes rules: - - alert: high_cpu_usage_on_node_WARNING - expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) >= {{ node_cpu_usage_percentage_theshold_Warning }} and (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) < {{ node_cpu_usage_percentage_theshold_Critial }} + - alert: high_cpu_usage_on_node + expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) > 90 for: 1m - labels: - severity: WARNING annotations: description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' 
- summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_cpu_usage_on_node_CRITICAL - expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) >= {{ node_cpu_usage_percentage_theshold_Critial }} and (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) < {{ node_cpu_usage_percentage_theshold_Fatal }} + summary: HIGH CPU USAGE WARNING ON '{% raw %}{{ $labels.nodename }}{% endraw %}' + - alert: high_memory_usage_on_node + expr: sum by(nodename) (((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) > 95 for: 1m - labels: - severity: CRITICAL annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' - summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_cpu_usage_on_node_FATAL - expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) >= {{ node_cpu_usage_percentage_theshold_Fatal }} + description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of MEMORY. MEMORY usage is over {% raw %}{{ humanize $value}}{% endraw %}%.' + summary: HIGH MEMORY USAGE WARNING TASK ON '{% raw %}{{ $labels.nodename }}{% endraw %}' + - alert: high_load_on_node + expr: sum by(nodename) ((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"})) + * on(instance) group_left(nodename) node_uname_info * 100) > 200 for: 1m - labels: - severity: FATAL - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' - summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_memory_usage_on_node_WARNING - expr: sum by(nodename) ((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) >= {{ node_memory_usage_percentage_theshold_Warning }} and (((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) < {{ node_memory_usage_percentage_theshold_Critical }} ) - for: 1m - labels: - severity: WARNING - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of MEMORY. MEMORY usage is over {% raw %}{{ humanize $value}}{% endraw %}.' 
- summary: 'HIGH MEMORY USAGE WARNING TASK ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_memory_usage_on_node_CRITICAL - expr: sum by(nodename) ((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) >= {{ node_memory_usage_percentage_theshold_Critical }} and (((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) < {{ node_memory_usage_percentage_theshold_Fatal }} ) - for: 1m - labels: - severity: CRITICAL - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of MEMORY. MEMORY usage is over {% raw %}{{ humanize $value}}{% endraw %}.' - summary: 'HIGH MEMORY USAGE WARNING TASK ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_memory_usage_on_node_FATAL - expr: sum by(nodename) (((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) >= {{ node_memory_usage_percentage_theshold_Fatal }} - for: 1m - labels: - severity: FATAL - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of MEMORY. MEMORY usage is over {% raw %}{{ humanize $value}}{% endraw %}.' - summary: 'HIGH MEMORY USAGE WARNING TASK ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_load_on_node_WARNING - expr: sum by(nodename) (((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) node_uname_info * 100) >= 85 and ((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) node_uname_info * 100) < 95 ) - for: 5m - labels: - severity: WARNING annotations: description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) has a high load average. Load average is {% raw %}{{ $value }}{% endraw %}%.' - summary: 'HIGH LOAD AVERAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_load_on_node_CRITICAL - expr: sum by(nodename) (((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) node_uname_info * 100) >= {{ node_load_avg_theshold_Warning }} and ((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) node_uname_info * 100) < {{ node_load_avg_theshold_Critial }} ) - for: 5m - labels: - severity: CRITICAL - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) has a high load average. Load average is {% raw %}{{ $value }}{% endraw %}%.' - summary: 'HIGH LOAD AVERAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_load_on_node_FATAL - expr: sum by(nodename) ((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) node_uname_info * 100) > {{ node_load_avg_theshold_Fatal }} - for: 5m - labels: - severity: FATAL - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) has a high load average. Load average is {% raw %}{{ $value }}{% endraw %}%.' 
- summary: 'HIGH LOAD AVERAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: node_exporter_down_CRITICAL + summary: HIGH LOAD AVERAGE WARNING ON '{% raw %}{{ $labels.nodename }}{% endraw %}' + - alert: node_exporter_down expr: up == 0 for: 1m - labels: - severity: CRITICAL annotations: description: The node exporter '{% raw %}{{ $labels.job }}{% endraw %}' is down. summary: 'NODE EXPORTER SERVICE CRITICAL: NODE ''{% raw %}{{ $labels.host }}{% endraw %}''' - - alert: node_running_out_of_disk_space_WARNING - expr: sum by(nodename) (((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) >= {{ node_disk_usage_percentage_theshold_Warning }} and ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) < {{ node_disk_usage_percentage_theshold_Critial }} ) - for: 1m - labels: - severity: WARNING - annotations: - description: 'More than 80% of disk used. Disk usage is {% raw %}{{ humanize $value }}{% endraw %}%' - summary: 'LOW DISK SPACE WARING: NODE {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: node_running_out_of_disk_space_WARNING - expr: sum by(nodename) (((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) >= {{ node_disk_usage_percentage_theshold_Critial }} and ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) < {{ node_disk_usage_percentage_theshold_Fatal }} ) - for: 1m - labels: - severity: CRITICAL - annotations: - description: 'More than 80% of disk used. Disk usage is {% raw %}{{ humanize $value }}{% endraw %}%' - summary: 'LOW DISK SPACE WARING: NODE {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: node_running_out_of_disk_space_FATAL - expr: sum by(nodename) ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) >= {{ node_disk_usage_percentage_theshold_Fatal }} + - alert: node_running_out_of_disk_space + expr: sum by(nodename) ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) + * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) > 80 for: 1m - labels: - severity: FATAL annotations: - description: 'More than 80% of disk used. Disk usage is {% raw %}{{ humanize $value }}{% endraw %}%' - summary: 'LOW DISK SPACE WARING: NODE {% raw %}{{ $labels.nodename }}{% endraw %}' + description: More than 80% of disk used. 
Disk usage is {% raw %}{{ humanize $value }}{% endraw %}% + summary: 'LOW DISK SPACE WARNING: NODE ''{% raw %}{{ $labels.nodename }}{% endraw %}'' ' diff --git a/ansible/roles/stack-monitor-stateful/templates/alertrules.postgresql.yml b/ansible/roles/stack-monitor-stateful/templates/alertrules.postgresql.yml index 2714545510f44ef713882c929adb787518d46af0..1a32502385e7ca24ae8488ea5ddc4cf0203a1653 100644 --- a/ansible/roles/stack-monitor-stateful/templates/alertrules.postgresql.yml +++ b/ansible/roles/stack-monitor-stateful/templates/alertrules.postgresql.yml @@ -6,32 +6,16 @@ groups: expr: pg_exporter_last_scrape_error == 1 for: 1m labels: - severity: CRITICAL + severity: critical annotations: description: 'PostgreSQL unavailable as per job: {% raw %}{{$labels.job}}{% endraw %}' summary: PostgreSQL unavailable {% endif %} - - alert: postgres_high_number_of_connections_WARNING - expr: sum(pg_stat_database_numbackends) > {{ postgres_number_of_connections_Warning }} + - alert: postgres_high_number_of_connections + expr: sum(pg_stat_database_numbackends) > 90 for: 1m labels: - severity: WARNING - annotations: - description: 'Number of connections is above the high water mark: {% raw %}{{$value}}{% endraw %}' - summary: PostgreSQL high number of connections - - alert: postgres_high_number_of_connections_CRITICAL - expr: sum(pg_stat_database_numbackends) > {{ postgres_number_of_connections_Critical }} - for: 1m - labels: - severity: CRITICAL - annotations: - description: 'Number of connections is above the high water mark: {% raw %}{{$value}}{% endraw %}' - summary: PostgreSQL high number of connections - - alert: postgres_high_number_of_connections_FATAL - expr: sum(pg_stat_database_numbackends) > {{ postgres_number_of_connections_Fatal }} - for: 1m - labels: - severity: FATAL + severity: critical annotations: description: 'Number of connections is above the high water mark: {% raw %}{{$value}}{% endraw %}' summary: PostgreSQL high number of connections @@ -39,7 +23,7 @@ groups: expr: pg_server_standby_status_in_recovery{job="master-postgres-exporter"} == 1 for: 1m labels: - severity: CRITICAL + severity: critical annotations: description: 'PostgreSQL master is in recovery. pg_server_standby_status_in_recovery: {% raw %}{{$value}}{% endraw %}' summary: PostgreSQL master is in recovery @@ -47,15 +31,15 @@ groups: expr: pg_server_standby_status_in_recovery{job="slave-postgres-exporter"} == 0 for: 1m labels: - severity: CRITICAL + severity: critical annotations: description: 'PostgreSQL slave is not in recovery. 
pg_server_standby_status_in_recovery: {% raw %}{{$value}}{% endraw %}' summary: PostgreSQL slave is not in recovery - - alert: postgres_high_peplication_byte_lag + - alert: postgres_high_replication_byte_lag expr: pg_stat_replication_byte_lag > 1e+06 for: 1m labels: - severity: CRITICAL + severity: critical annotations: description: 'The replication byte lag for salve: {% raw %}{{$labels.slave_addr}}{% endraw %} is above the high water mark: {% raw %}{{$value}}{% endraw %}' summary: PostgreSQL replication byte lag is high @@ -63,7 +47,7 @@ groups: expr: pg_replication_lag > 60 for: 1m labels: - severity: CRITICAL + severity: critical annotations: description: 'The replication lag between the master and slave is above the high water mark: {% raw %}{{$value}}{% endraw %}' diff --git a/ansible/roles/stack-monitor-stateful/templates/alertrules.process.yml b/ansible/roles/stack-monitor-stateful/templates/alertrules.process.yml index 98a73686b1e52c5325b64ffcd9d74e6cc096930b..e1127d06c9b5aa9ca8d1d65d1ce6f69976acd443 100644 --- a/ansible/roles/stack-monitor-stateful/templates/alertrules.process.yml +++ b/ansible/roles/stack-monitor-stateful/templates/alertrules.process.yml @@ -4,152 +4,120 @@ groups: - alert: tomcat_process_not_running expr: namedprocess_namegroup_states{groupname="tomcat",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: tomcat process is not running - alert: search_process_not_running expr: namedprocess_namegroup_states{groupname="search",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: search process is not running - alert: neo4j_process_not_running expr: namedprocess_namegroup_states{groupname="neo4j",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: neo4j process is not running - alert: kafka_process_not_running expr: namedprocess_namegroup_states{groupname="kafka",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Kafka process is not running - alert: kafka_more_than_one_process_running expr: namedprocess_namegroup_num_procs{groupname="kafka"} > 1 for: 1m - labels: - severity: CRITICAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: More than one process running + - alert: search_process_not_running + expr: namedprocess_namegroup_states{groupname="search",state="Sleeping"} < 1 + for: 1m + annotations: + description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' + summary: search process is not running - alert: secor_process_not_running expr: namedprocess_namegroup_states{groupname="secor",state="Sleeping"} != 9 for: 1m - labels: - severity: CRITICAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Secor process is not running - alert: zookeeper_process_not_running expr: namedprocess_namegroup_states{groupname="zookeeper",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Zookeeper process is not running - alert: yarn_process_not_running expr: namedprocess_namegroup_states{groupname="yarn",state="Sleeping"} < 1 
for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: YARN process is not running - alert: cassandra_process_not_running expr: namedprocess_namegroup_states{groupname="cassandra",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Cassandra process is not running - alert: elasticsearch_process_not_running expr: namedprocess_namegroup_states{groupname="elasticsearch",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Elasticsearch process is not running - alert: logstash_process_not_running expr: namedprocess_namegroup_states{groupname="logstash",state="Sleeping"} < 1 for: 1m - labels: - severity: CRITICAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Logstash process is not running - alert: Analytics_api_process_not_running expr: namedprocess_namegroup_states{groupname="analyticsapi",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Analytics API process is not running - alert: druid_zookeeper_process_not_running expr: namedprocess_namegroup_states{groupname="druidzookeeper",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Druid zookeeper is not running - alert: druid_postgres_process_not_running expr: namedprocess_namegroup_states{groupname="druidpostgres",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Druid postgres is not running - alert: druid_overlord_process_not_running expr: namedprocess_namegroup_states{groupname="overlord",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Druid overlord process is not running - alert: druid_coordinator_process_not_running expr: namedprocess_namegroup_states{groupname="coordinator",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Druid coordinator process is not running - alert: druid_historical_process_not_running expr: namedprocess_namegroup_states{groupname="historical",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Druid historical process is not running - alert: druid_broker_process_not_running expr: namedprocess_namegroup_states{groupname="broker",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Druid broker process is not running - alert: druid_middleManager_process_not_running expr: namedprocess_namegroup_states{groupname="middleManager",state="Sleeping"} < 1 for: 1m - labels: - severity: FATAL annotations: description: 'Number of running processes are: {% raw %}{{$value}}{% endraw %}' summary: Druid middleManager process is not running diff --git a/ansible/roles/stack-monitor-stateful/templates/alertrules.services.yml 
b/ansible/roles/stack-monitor-stateful/templates/alertrules.services.yml index 037e5f252dd6275db854bf9948e6db19da0316b0..1f3952a0c49c4c06b38b3c386a1c26f7e563a3c7 100644 --- a/ansible/roles/stack-monitor-stateful/templates/alertrules.services.yml +++ b/ansible/roles/stack-monitor-stateful/templates/alertrules.services.yml @@ -1,19 +1,15 @@ groups: - name: alertrules.services rules: - - alert: service_down_FATAL + - alert: service_down expr: probe_success == 0 for: 1m - labels: - severity: FATAL annotations: description: '{% raw %}{{ $labels.job }}{% endraw %}: The service is down.' summary: Service down - alert: health_check expr: changes(probe_success[5m]) > 2 for: 2m - labels: - severity: CRITICAL annotations: description: 'The service status has changed {% raw %}{{$value}}{% endraw %} times in last 2 minutes. Threshold is : 2' summary: Health check is failing diff --git a/ansible/roles/stack-monitor-stateful/templates/alertrules.task.yml b/ansible/roles/stack-monitor-stateful/templates/alertrules.task.yml index 7aa87cf50523b680414317c81396eb44ee6d38fd..974c8cfcd0a073519379c77d7aa2d0cb9eadd596 100644 --- a/ansible/roles/stack-monitor-stateful/templates/alertrules.task.yml +++ b/ansible/roles/stack-monitor-stateful/templates/alertrules.task.yml @@ -1,59 +1,24 @@ groups: - name: alertrules.task rules: - - alert: high_cpu_usage_on_container_WARNING - expr: sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) * 100 >= {{ container_cpu_usage_percentage_theshold_Warning }} and sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) * 100 < {{ container_cpu_usage_percentage_theshold_Critial }} + - alert: high_cpu_usage_on_container + expr: sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) * 100 > {{ container_cpu_usage_percentage_theshold }} for: 1m - labels: - severity: WARNING annotations: description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% CPU. 
Threshold is : {{ container_cpu_usage_percentage_theshold }}%' summary: 'HIGH CPU USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: high_cpu_usage_on_container_CRITICAL - expr: sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) * 100 >= {{ container_cpu_usage_percentage_theshold_Critical }} and sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) * 100 < {{ container_cpu_usage_percentage_theshold_Fatal }} + + - alert: high_memory_usage_on_container + expr: (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 > {{ container_memory_usage_percentage_theshold }} < Inf for: 1m - labels: - severity: CRITICAL - annotations: - description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% CPU. Threshold is : {{ container_cpu_usage_percentage_theshold }}%' - summary: 'HIGH CPU USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: high_cpu_usage_on_container_FATAL - expr: sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) * 100 >= {{ container_cpu_usage_percentage_theshold_Fatal }} - for: 1m - labels: - severity: FATAL - annotations: - description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% CPU. Threshold is : {{ container_cpu_usage_percentage_theshold }}%' - summary: 'HIGH CPU USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: high_memory_usage_on_container_WARNING - expr: (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 >= {{ container_memory_usage_percentage_theshold_Warning }} and (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 < {{ container_memory_usage_percentage_theshold_Critical }} - for: 1m - labels: - severity: WARNING - annotations: - description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% memory. 
Threshold is : {{ container_memory_usage_percentage_theshold }} %' - summary: 'HIGH MEMORY USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: high_memory_usage_on_container_CRITICAL - expr: (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 >= {{ container_memory_usage_percentage_theshold_Critical }} and (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 < {{ container_memory_usage_percentage_theshold_Fatal }} - for: 1m - labels: - severity: CRITICAL - annotations: - description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% memory. Threshold is : {{ container_memory_usage_percentage_theshold }} %' - summary: 'HIGH MEMORY USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: high_memory_usage_on_container_FATAL - expr: (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 >= {{ container_memory_usage_percentage_theshold_Fatal }} - for: 1m - labels: - severity: FATAL annotations: description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% memory. Threshold is : {{ container_memory_usage_percentage_theshold }} %' summary: 'HIGH MEMORY USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: replicas_uneven_FATAL - expr: sum by (service_name) (docker_service_replicas_expected != docker_service_replicas_running) + + - alert: replicas_uneven + expr: sum by (service_name) (docker_service_replicas_expected != docker_service_replicas_running) for: 1m - labels: - severity: FATAL annotations: description: 'UNEVEN REPLICAS COUNT FOR {% raw %}{{ $labels.service_name }}{% endraw %}' summary: 'UNEVEN REPLICAS COUNT: {% raw %}{{ $labels.service_name }}{% endraw %} is having uneven count' + diff --git a/ansible/roles/stack-monitor/defaults/main.yml b/ansible/roles/stack-monitor/defaults/main.yml index a28d31caa4e8a4233abbd4da8e4031b8faac41c6..66b33712cccc36e966f3f3c65dec0108f8e8b855 100644 --- a/ansible/roles/stack-monitor/defaults/main.yml +++ b/ansible/roles/stack-monitor/defaults/main.yml @@ -170,36 +170,3 @@ backup_storage_name: prometheus_backup docker_service_replicas_memory_limit: 256MB prometheus_mount_point: "/root/dockerdata/prometheus/data/" - -#################################################### Monitoring limits ################################################ -container_cpu_usage_percentage_theshold_Warning: 70 -container_cpu_usage_percentage_theshold_Critial: 85 -container_cpu_usage_percentage_theshold_Fatal: 95 - -container_memory_usage_percentage_theshold_Warning: 70 -container_memory_usage_percentage_theshold_Critical: 85 -container_memory_usage_percentage_theshold_Fatal: 95 - -node_cpu_usage_percentage_theshold_Warning: 70 -node_cpu_usage_percentage_theshold_Critial: 85 -node_cpu_usage_percentage_theshold_Fatal: 95 - -node_memory_usage_percentage_theshold_Warning: 70 -node_memory_usage_percentage_theshold_Critical: 85 -node_memory_usage_percentage_theshold_Fatal: 95 - -node_load_avg_theshold_Warning: 85 
-node_load_avg_theshold_Critial: 95 -node_load_avg__theshold_Fatal: 120 - -node_disk_usage_percentage_theshold_Warning: 70 -node_disk_usage_percentage_theshold_Critial: 85 -node_disk_usage_percentage_theshold_Fatal: 95 - -postgres_number_of_connections_Warning: 100 -postgres_number_of_connections_Critical: 110 -postgres_number_of_connections_Fatal: 130 - -elasticsearch_filesystem_data_remaining_theshold_Warning: 30 -elasticsearch_filesystem_data_remaining_theshold_Critical: 20 -elasticsearch_filesystem_data_remaining_theshold_Fatal: 10 diff --git a/ansible/roles/stack-monitor/templates/alertrules.docker.yml b/ansible/roles/stack-monitor/templates/alertrules.docker.yml index e222e7c07d6b3fc50d3559a303fb07a2bd16f46d..f3ff6b89ffad2f2958ed64b3804c89e3dd1b2717 100644 --- a/ansible/roles/stack-monitor/templates/alertrules.docker.yml +++ b/ansible/roles/stack-monitor/templates/alertrules.docker.yml @@ -1,11 +1,11 @@ groups: - name: alertrules.docker rules: - - alert: docker_swarm_node_down_FATAL + - alert: docker_swarm_node_down expr: swarm_manager_nodes{state="down"} > 0 for: 1m labels: - severity: FATAL + severity: critical annotations: description: 'Number nodes down : {% raw %}{{$value}}{% endraw %}' summary: 'Docker swarm node down' diff --git a/ansible/roles/stack-monitor/templates/alertrules.kong.yml b/ansible/roles/stack-monitor/templates/alertrules.kong.yml index 9122ba64ca9bb2e211acd2013bfc8a1299b341a1..2b3bd5b819d4534858f7d0a00aa236cad5ed61b8 100644 --- a/ansible/roles/stack-monitor/templates/alertrules.kong.yml +++ b/ansible/roles/stack-monitor/templates/alertrules.kong.yml @@ -1,11 +1,11 @@ groups: - name: alertrules.kong rules: - - alert: kong_cluster_unhealthy_FATAL + - alert: kong_cluster_unhealthy expr: kong_cluster_alive_nodes != {{ kong_cluster_expected_number_of_nodes }} for: 1m labels: - severity: FATAL + severity: critical annotations: description: 'Number of live nodes : {% raw %}{{$value}}{% endraw %} not equal to : {{ kong_cluster_expected_number_of_nodes }}' summary: 'Kong cluster is unhealthy' diff --git a/ansible/roles/stack-monitor/templates/alertrules.nodes.yml b/ansible/roles/stack-monitor/templates/alertrules.nodes.yml index 890fe123a04a0e09339a40b2c30ae633fbea376f..e321c9edbc7e3f7021a6007534ceb7325959a2e2 100644 --- a/ansible/roles/stack-monitor/templates/alertrules.nodes.yml +++ b/ansible/roles/stack-monitor/templates/alertrules.nodes.yml @@ -1,107 +1,34 @@ groups: - name: alertrules.nodes rules: - - alert: high_cpu_usage_on_node_WARNING - expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) >= {{ node_cpu_usage_percentage_theshold_Warning }} and (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) < {{ node_cpu_usage_percentage_theshold_Critical }} + - alert: high_cpu_usage_on_node + expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) > 90 for: 1m - labels: - severity: WARNING annotations: description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' 
summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_cpu_usage_on_node_CRITICAL - expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) >= {{ node_cpu_usage_percentage_theshold_Critical }} and (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) < {{ node_cpu_usage_percentage_theshold_Fatal }} + - alert: high_memory_usage_on_node + expr: sum by(nodename) (((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) > 90 for: 1m - labels: - severity: CRITICAL - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' - summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_cpu_usage_on_node_FATAL - expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="vm-node-exporter",mode="idle"}[5m])) * 100) >= {{ node_cpu_usage_percentage_theshold_Fatal }} - for: 1m - labels: - severity: FATAL - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of CPU. CPU usage is {% raw %}{{ humanize $value}}{% endraw %}%.' - summary: 'HIGH CPU USAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_memory_usage_on_node_WARNING - expr: sum by(nodename) ((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) >= {{ node_memory_usage_percentage_theshold_Warning }} and (((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) < {{ node_memory_usage_percentage_theshold_Critical }} ) - for: 1m - labels: - severity: WARNING annotations: description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of MEMORY. MEMORY usage is over {% raw %}{{ humanize $value}}{% endraw %}.' summary: 'HIGH MEMORY USAGE WARNING TASK ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_memory_usage_on_node_CRITICAL - expr: sum by(nodename) ((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) >= {{ node_memory_usage_percentage_theshold_Critical }} and (((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) < {{ node_memory_usage_percentage_theshold_Fatal }} ) + - alert: high_load_on_node + expr: sum by(nodename) ((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"})) + * on(instance) group_left(nodename) node_uname_info * 100) > 100 for: 1m - labels: - severity: CRITICAL - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of MEMORY. MEMORY usage is over {% raw %}{{ humanize $value}}{% endraw %}.' 
- summary: 'HIGH MEMORY USAGE WARNING TASK ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_memory_usage_on_node_FATAL - expr: sum by(nodename) (((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * on(instance) group_left(nodename) node_uname_info * 100) >= {{ node_memory_usage_percentage_theshold_Fatal }} - for: 1m - labels: - severity: FATAL - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) is using a LOT of MEMORY. MEMORY usage is over {% raw %}{{ humanize $value}}{% endraw %}.' - summary: 'HIGH MEMORY USAGE WARNING TASK ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_load_on_node_WARNING - expr: sum by(nodename) (((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) node_uname_info * 100) >= {{ node_load_avg_theshold_Warning }} and ((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) node_uname_info * 100) < {{ node_load_avg_theshold_Critial }} ) - for: 5m - labels: - severity: WARNING annotations: description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) has a high load average. Load average is {% raw %}{{ $value }}{% endraw %}%.' summary: 'HIGH LOAD AVERAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_load_on_node_CRITICAL - expr: sum by(nodename) (((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) node_uname_info * 100) >= {{ node_load_avg_theshold_Critial }} and ((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) node_uname_info * 100) < {{ node_load_avg_theshold_Fatal }} ) - for: 5m - labels: - severity: CRITICAL - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) has a high load average. Load average is {% raw %}{{ $value }}{% endraw %}%.' - summary: 'HIGH LOAD AVERAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: high_load_on_node_FATAL - expr: sum by(nodename) ((node_load1 / count without(cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) node_uname_info * 100) > {{ node_load_avg_theshold_Fatal }} - for: 5m - labels: - severity: FATAL - annotations: - description: '{% raw %}{{ $labels.nodename }}{% endraw %} ({% raw %}{{ $labels.host }}{% endraw %}) has a high load average. Load average is {% raw %}{{ $value }}{% endraw %}%.' - summary: 'HIGH LOAD AVERAGE WARNING ON {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: monitoring_service_down_FATAL + - alert: monitoring_service_down expr: up == 0 for: 1m - labels: - severity: FATAL annotations: description: 'The monitoring service {% raw %}{{ $labels.job }}{% endraw %} is down.' 
summary: 'MONITORING SERVICE DOWN WARNING: NODE {% raw %}{{ $labels.host }}{% endraw %}' - - alert: node_running_out_of_disk_space_WARNING - expr: sum by(nodename) (((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) >= {{ node_disk_usage_percentage_theshold_Warning }} and ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) < {{ node_disk_usage_percentage_theshold_Critial }} ) - for: 1m - labels: - severity: WARNING - annotations: - description: 'More than 80% of disk used. Disk usage is {% raw %}{{ humanize $value }}{% endraw %}%' - summary: 'LOW DISK SPACE WARING: NODE {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: node_running_out_of_disk_space_WARNING - expr: sum by(nodename) (((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) >= {{ node_disk_usage_percentage_theshold_Critial }} and ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) < {{ node_disk_usage_percentage_theshold_Fatal }} ) - for: 1m - labels: - severity: CRITICAL - annotations: - description: 'More than 80% of disk used. Disk usage is {% raw %}{{ humanize $value }}{% endraw %}%' - summary: 'LOW DISK SPACE WARING: NODE {% raw %}{{ $labels.nodename }}{% endraw %}' - - alert: node_running_out_of_disk_space_FATAL - expr: sum by(nodename) ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) >= {{ node_disk_usage_percentage_theshold_Fatal }} + - alert: node_running_out_of_disk_space + expr: sum by(nodename) ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) > 80 for: 1m - labels: - severity: FATAL annotations: description: 'More than 80% of disk used. 
Disk usage is {% raw %}{{ humanize $value }}{% endraw %}%' summary: 'LOW DISK SPACE WARING: NODE {% raw %}{{ $labels.nodename }}{% endraw %}' diff --git a/ansible/roles/stack-monitor/templates/alertrules.task.yml b/ansible/roles/stack-monitor/templates/alertrules.task.yml index 34dbef54b1d931081850134c021ee6529bf2a259..f30b3e13555a4a8b4087224ff001fe6e6b5fff1f 100644 --- a/ansible/roles/stack-monitor/templates/alertrules.task.yml +++ b/ansible/roles/stack-monitor/templates/alertrules.task.yml @@ -1,59 +1,25 @@ groups: - name: alertrules.task rules: - - alert: high_cpu_usage_on_container_WARNING - expr: sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) * 100 >= {{ container_cpu_usage_percentage_theshold_Warning }} and sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) * 100 < {{ container_cpu_usage_percentage_theshold_Critial }} + - alert: high_cpu_usage_on_container + expr: sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, + instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) + * 100 > {{ container_cpu_usage_percentage_theshold }} for: 1m - labels: - severity: WARNING annotations: description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% CPU. Threshold is : {{ container_cpu_usage_percentage_theshold }}%' summary: 'HIGH CPU USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: high_cpu_usage_on_container_CRITICAL - expr: sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) * 100 >= {{ container_cpu_usage_percentage_theshold_Critical }} and sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) * 100 < {{ container_cpu_usage_percentage_theshold_Fatal }} + + - alert: high_memory_usage_on_container + expr: (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 > {{ container_memory_usage_percentage_theshold }} < Inf for: 1m - labels: - severity: CRITICAL - annotations: - description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% CPU. 
Threshold is : {{ container_cpu_usage_percentage_theshold }}%' - summary: 'HIGH CPU USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: high_cpu_usage_on_container_FATAL - expr: sum by(container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_name, instance) (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) * 100 >= {{ container_cpu_usage_percentage_theshold_Fatal }} - for: 1m - labels: - severity: FATAL - annotations: - description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% CPU. Threshold is : {{ container_cpu_usage_percentage_theshold }}%' - summary: 'HIGH CPU USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: high_memory_usage_on_container_WARNING - expr: (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 >= {{ container_memory_usage_percentage_theshold_Warning }} and (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 < {{ container_memory_usage_percentage_theshold_Critical }} - for: 1m - labels: - severity: WARNING - annotations: - description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% memory. Threshold is : {{ container_memory_usage_percentage_theshold }} %' - summary: 'HIGH MEMORY USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: high_memory_usage_on_container_CRITICAL - expr: (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 >= {{ container_memory_usage_percentage_theshold_Critical }} and (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 < {{ container_memory_usage_percentage_theshold_Fatal }} - for: 1m - labels: - severity: CRITICAL - annotations: - description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% memory. Threshold is : {{ container_memory_usage_percentage_theshold }} %' - summary: 'HIGH MEMORY USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: high_memory_usage_on_container_FATAL - expr: (container_memory_usage_bytes{container_label_com_docker_swarm_task_name=~".+"} / container_spec_memory_limit_bytes) * 100 >= {{ container_memory_usage_percentage_theshold_Fatal }} - for: 1m - labels: - severity: FATAL annotations: description: '{% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} is using {% raw %}{{ $value }}{% endraw %}% memory. 
Threshold is : {{ container_memory_usage_percentage_theshold }} %' summary: 'HIGH MEMORY USAGE WARNING: TASK {% raw %}{{ $labels.container_label_com_docker_swarm_task_name }}{% endraw %} on {% raw %}{{ $labels.instance }}{% endraw %}' - - alert: replicas_uneven_FATAL + + - alert: replicas_uneven expr: sum by (service_name) (docker_service_replicas_expected != docker_service_replicas_running) for: 1m - labels: - severity: FATAL annotations: description: 'UNEVEN REPLICAS COUNT FOR {% raw %}{{ $labels.service_name }}{% endraw %}' summary: 'UNEVEN REPLICAS COUNT: {% raw %}{{ $labels.service_name }}{% endraw %} is having uneven count'
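Note (illustrative sketch, not part of this patch): the rules above now carry a single lowercase "severity: critical" label instead of the old WARNING/CRITICAL/FATAL tiers, so any Alertmanager routing that matched on the upper-case values would need a matching route on the new label. A minimal route of that shape is sketched below, assuming the classic route/match syntax; the receiver names ops-default and ops-pager are hypothetical placeholders, not values defined in this repository.

# alertmanager.yml (sketch only, under the assumptions stated above)
route:
  receiver: ops-default           # hypothetical catch-all receiver
  routes:
    - match:
        severity: critical        # the lowercase label set by these templates
      receiver: ops-pager         # hypothetical paging receiver
receivers:
  - name: ops-default
  - name: ops-pager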