# alertrules.es.yml
# Rule group definitions; this file declares a single group.
groups:
- name: alertrules.es
  # Each list item below is either a recording rule (`record`) or an
  # alerting rule (`alert`).
  rules:
  # Used share of the data filesystem as a percentage:
  # 100 * (size - free) / size.
  - record: elasticsearch_filesystem_data_used_percent
    expr: >-
      100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes)
      / elasticsearch_filesystem_data_size_bytes
  # Free share of the data filesystem as a percentage, computed as the
  # complement of the elasticsearch_filesystem_data_used_percent series.
  - record: elasticsearch_filesystem_data_free_percent
    expr: '100 - elasticsearch_filesystem_data_used_percent'
  # Fires when the node count reported via the elasticsearch-exporter job
  # stays below the size of the `es` inventory group for one minute.
  # The threshold is filled in at template-render time; the value
  # placeholder inside the raw section is left for the alerting engine.
  - alert: elasticsearch_too_few_nodes_running
    expr: >-
      elasticsearch_cluster_health_number_of_nodes{job="elasticsearch-exporter"}
      < {{ groups['es'] | length }}
    for: 1m
    annotations:
      summary: "ElasticSearch running on less than {{ groups['es'] | length }} nodes"
      description: "There are only {% raw %}{{$value}}{% endraw %} < {{ groups['es'] | length }} ElasticSearch nodes running"
    labels:
      severity: critical
{% if groups['log-es'] is defined %}
  # Same node-count check as the `es` rule, but for the `log-es` inventory
  # group and the log-elasticsearch-exporter job. Only rendered when the
  # `log-es` group exists in the inventory.
  - alert: elasticsearch_too_few_nodes_running
    expr: >-
      elasticsearch_cluster_health_number_of_nodes{job="log-elasticsearch-exporter"}
      < {{ groups['log-es'] | length }}
    for: 1m
    annotations:
      summary: "ElasticSearch running on less than {{ groups['log-es'] | length }} nodes"
      description: "There are only {% raw %}{{$value}}{% endraw %} < {{ groups['log-es'] | length }} ElasticSearch nodes running"
    labels:
      severity: critical
{% endif %}
  # Fires when the ratio of used to maximum JVM heap bytes stays above 0.9
  # for the duration given in `for`.
  - alert: elasticsearch_heap_too_high
    expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.9
    for: 1m
    labels:
      severity: critical
    annotations:
      # The duration in this text must match `for` above.
      # NOTE(review): if a 15m hold was actually intended, raise `for`
      # to 15m and update this text accordingly.
      description: The heap usage is over 90% for 1m
      summary: ElasticSearch node {% raw %}{{$labels.node}}{% endraw %} heap usage is high
  # Fires when the age of the latest successful snapshot exceeds the
  # configured interval. The division by 1000 binds tighter than the
  # subtraction, so the timestamp is scaled before being subtracted from
  # time() (presumably milliseconds to seconds; confirm against the
  # exporter). The threshold is the configured minutes multiplied by 60.
  - alert: elasticsearch_snapshot_is_too_old
    expr: >-
      time() -
      elasticsearch_snapshots_latest_successful_snapshot_timestamp{job="elasticsearch-snapshots-exporter"} / 1000 >
      {{ expected_elasticsearch_snapshot_interval_in_minutes|int * 60 }}
    for: 1m
    annotations:
      summary: "Latest elasticSearch snapshot was taken {% raw %}{{ humanizeDuration $value }}{% endraw %} ago. Threshold is {{ expected_elasticsearch_snapshot_interval_in_minutes }} minutes"
      description: "Elasticsearch snapshot is too old"
    labels:
      severity: critical