---
# main.yml
# Per-service memory settings: each service has a `*_reservation_memory` and a
# `*_limit_memory` value (M / G suffixes).
# NOTE(review): presumably rendered into the monitoring stack's container
# resource settings; confirm against the stack templates.
# Prometheus server (memory plus how long stored data is retained).
prometheus_reservation_memory: 1G
prometheus_limit_memory: 1G
prometheus_storage_retention_time: 90d
# Alertmanager.
alertmanager_reservation_memory: 100M
alertmanager_limit_memory: 100M
# Exporters and collectors.
node_exporter_reservation_memory: 16M
node_exporter_limit_memory: 32M
cadvisor_reservation_memory: 100M
cadvisor_limit_memory: 100M
elasticsearch_exporter_reservation_memory: 8M
elasticsearch_exporter_limit_memory: 24M
postgres_exporter_reservation_memory: 16M
postgres_exporter_limit_memory: 32M
statsd_exporter_reservation_memory: 8M
statsd_exporter_limit_memory: 16M
blackbox_exporter_reservation_memory: 16M
blackbox_exporter_limit_memory: 32M
jsonpath_exporter_reservation_memory: 32M
jsonpath_exporter_limit_memory: 64M
azure_blob_exporter_reservation_memory: 16M
azure_blob_exporter_limit_memory: 64M
# Grafana.
grafana_reservation_memory: 100M
grafana_limit_memory: 100M
# Single-value container usage thresholds (percent).
# NOTE(review): "theshold" is misspelled in both key names. They are left
# unchanged because consumers outside this file may reference them; tiered
# `container_*_usage_percentage_threshold_{Warning,Critical,Fatal}` variables
# are also defined later in this file.
container_cpu_usage_percentage_theshold: 90
container_memory_usage_percentage_theshold: 90
# Override this at environment level.
expected_minimum_logs_per_minute: 0
server_side_http_errors_threshold_percentage: 1
# Defaults to 1 day (24 hours) plus 1 extra hour for the backup process to
# complete, i.e. 25 * 60 = 1500 minutes.
expected_data_backup_interval_in_minutes: "{{ 25 * 60 }}"
# Defaults to 1 day (24 hours) plus 1 extra hour for the snapshot process to
# complete, i.e. 25 * 60 = 1500 minutes.
expected_elasticsearch_snapshot_interval_in_minutes: "{{ 25 * 60 }}"
# Defaults to 1KB. Good enough to catch errors mentioned in https://about.gitlab.com/2017/02/01/gitlab-dot-com-database-incident/
expected_data_backup_size_in_bytes: 1024
# Feature toggles.
enable_postgres_availability_check: true
enable_scraping_docker_metrics: false
# Kept as a quoted string, not an integer.
docker_metrics_port: "2377"
# Falls back to 1 when `kong_replicas` is not defined.
kong_cluster_expected_number_of_nodes: "{{ kong_replicas | default(1) }}"
# Destination directories for the stack and config files.
monitor_stack_files_dest_dir: /opt/docker/stacks/stateful_monitor/stack
monitor_config_files_dest_dir: /opt/docker/stacks/stateful_monitor/config
# Version and name of the docker-service-replicas exporter.
docker_service_replicas_exporter_version: v0.0.3
docker_service_replicas_exporter_name: docker-service-replicas-exporter
es_port: 9200
# Config templates handled for the monitoring stack.
# NOTE(review): presumably each entry is rendered into
# `monitor_config_files_dest_dir`; confirm in the role's tasks.
monitor_config_templates:
  - prometheus.yml
  - alertmanagerconfig.yml
  - blackboxconfig.yml
  - statsd_mapping.yml
  - elasticsearch_snapshots_exporter_config.yml
  - data_backup_azure_blob_exporter_config.yml
  - alertrules.nodes.yml
  - alertrules.task.yml
  - alertrules.es.yml
  - alertrules.logs.yml
  - alertrules.backups.yml
  - alertrules.services.yml
  - alertrules.postgresql.yml
  - alertrules.process.yml
  - alertrules.kafkalag.yml
  - jmx_httpserver.yml

# Postgres query templates (master and slave variants).
monitor_config_templates_postgres:
  - postgresmasterqueries.yml
  - postgresslavequeries.yml

# DevOps alert routing; both values come from variables defined outside this
# file.
devops_alerts_mailing_list: "{{ alerts_mailing_list }}"
devops_alerts_mailing_severity_filter: "{{ alerts_mailing_severity_filter }}"

# Maps each team to its alert mailing list, the services it is listed for, and
# a severity filter. Teams other than devops fall back to the devops mailing
# list when their own `<team>_alerts_mailing_list` variable is undefined.
# NOTE(review): `services` entries such as `monitor_.*` look like regular
# expressions matched against service names; confirm in the alertmanager
# template.
service_teams:
  - team: devops_team
    alerts_mailing_list: "{{ devops_alerts_mailing_list }}"
    services:
      - monitor_.*
      - monit
      - logger_.*
      - proxy_.*
    severity_mailing_filter: "{{ devops_alerts_mailing_severity_filter }}"
  - team: app_team
    alerts_mailing_list: "{{ app_alerts_mailing_list | default(devops_alerts_mailing_list) }}"
    services:
      - actor-service
      - learner-service
      - lms-service
      - content-service
      - player_player
      - cassandra
      - composite_search
      - analytics-api
      - tomcat
      - logstash
      - search
      - neo4j
    severity_mailing_filter: "{{ devops_alerts_mailing_severity_filter }}"
  - team: keycloak_team
    alerts_mailing_list: "{{ keycloak_alerts_mailing_list | default(devops_alerts_mailing_list)}}"
    services:
      - keycloak
    severity_mailing_filter: "{{ devops_alerts_mailing_severity_filter }}"
  - team: api_manager_team
    alerts_mailing_list: "{{ api_manager_alerts_mailing_list | default(devops_alerts_mailing_list)}}"
    services:
      - api-manager_.*
      - adminutil_.*
    severity_mailing_filter: "{{ devops_alerts_mailing_severity_filter }}"
  - team: site_team
    alerts_mailing_list: "{{ site_alerts_mailing_list | default(devops_alerts_mailing_list)}}"
    services:
      - sunbird_static_site
    severity_mailing_filter: "{{ devops_alerts_mailing_severity_filter }}"
  - team: druid
    alerts_mailing_list: "{{ druid_alerts_mailing_list | default(devops_alerts_mailing_list)}}"
    services:
      - druidzookeeper
      - druidpostgres
      - overlord
      - coordinator
      - historical
      - broker
      - middlemanager
      - druid
    # Unlike the other teams, druid may supply its own severity filter and
    # falls back to the devops filter otherwise.
    severity_mailing_filter: "{{ druid_alerts_mailing_severity_filter | d(devops_alerts_mailing_severity_filter) }}"

# HTTP probes: each entry names a service, the probe module to use, and the
# target URLs to probe.
service_blackbox_checks:
  - service_name: 'analytics-api'
    probe_module: http_2xx
    targets:
      - "{{sunbird_analytics_api_base_url}}/health"
  - service_name: 'learning-service'
    probe_module: http_2xx
    targets:
      - "{{sunbird_content_repo_api_base_url}}/health"
  - service_name: 'search-service'
    probe_module: http_2xx
    targets:
      - "{{sunbird_search_service_api_base_url}}/health"
  - service_name: 'monit'
    probe_module: http_2xx
    targets:
      # First host of the `swarm-bootstrap-manager` inventory group.
      - "http://{{ groups['swarm-bootstrap-manager'][0] }}:2812"
  - service_name: 'keycloak'
    probe_module: http_2xx
    targets:
      - "{{proto}}://{{proxy_server_name}}/auth/realms/sunbird/protocol/openid-connect/auth?client_id=portal&state=foo&redirect_uri=https%3A%2F%2F{{proxy_server_name}}%2Fprivate%2Findex%3Fauth_callback%3D1&scope=openid&response_type=code"

# External URLs and route prefixes for Prometheus (port 9090) and
# Alertmanager (port 9093).
prometheus_route_prefix: prometheus
prometheus_web_external_url: "{{proto}}://{{api__host}}:9090/{{ prometheus_route_prefix }}"
prometheus_alertmanager_route_prefix: alertmanager
prometheus_alertmanager_web_external_url: "{{proto}}://{{api__host}}:9093/{{ prometheus_alertmanager_route_prefix }}"

# Postgres exporter connection settings. The password has no default here; it
# is written as an explicit null, which parses the same as the bare key.
# NOTE(review): presumably supplied per environment or from a secrets store;
# confirm.
postgres_exporter_password: null
postgres_exporter_postgres_port: 5432
postgres_exporter_user: postgres_exporter

root_group: root
root_owner: root
backup_storage_name: prometheus_backup
prometheus_stateful_mount_point: "/root/dockerdata/prometheus_stateful/data/"
docker_service_replicas_memory_limit: 512MB

# ----------------------------- Monitoring limits -----------------------------
# Tiered thresholds: every metric has Warning, Critical and Fatal levels.
container_cpu_usage_percentage_threshold_Warning: 75
container_cpu_usage_percentage_threshold_Critical: 85
container_cpu_usage_percentage_threshold_Fatal: 95

container_memory_usage_percentage_threshold_Warning: 75
container_memory_usage_percentage_threshold_Critical: 85
container_memory_usage_percentage_threshold_Fatal: 95

node_cpu_usage_percentage_threshold_Warning: 75
# Tiered monitoring thresholds, one key per Warning / Critical / Fatal level.
node_cpu_usage_percentage_threshold_Critical: 85
node_cpu_usage_percentage_threshold_Fatal: 95

node_memory_usage_percentage_threshold_Warning: 75
node_memory_usage_percentage_threshold_Critical: 85
node_memory_usage_percentage_threshold_Fatal: 95

# NOTE(review): unit of the load-average thresholds is not visible here;
# confirm against the alert rules.
node_load_avg_threshold_Warning: 85
node_load_avg_threshold_Critical: 95
node_load_avg_threshold_Fatal: 120

node_disk_usage_percentage_threshold_Warning: 75
node_disk_usage_percentage_threshold_Critical: 85
node_disk_usage_percentage_threshold_Fatal: 95

postgres_number_of_connections_Warning: 100
postgres_number_of_connections_Critical: 110
postgres_number_of_connections_Fatal: 130

# "Remaining" thresholds: the values decrease as the level gets more severe.
# NOTE(review): presumably a percentage of free data-filesystem space; confirm.
elasticsearch_filesystem_data_remaining_threshold_Warning: 25
elasticsearch_filesystem_data_remaining_threshold_Critical: 15
elasticsearch_filesystem_data_remaining_threshold_Fatal: 10

### secor consumer lag threshold
events_deviceprofile_backup_threshold: 1000
telemetry_channel_backup_threshold: 10000
telemetry_derived_backup_threshold: 10000
telemetry_failed_backup_threshold: 10000
telemetry_ingestion_backup_threshold: 1000
graph_events_backup_threshold: 500
telemetry_raw_backup_threshold: 10000
telemetry_unique_backup_threshold: 10000
learning_failed_events_backup_threshold: 500
telemetry_denorm_backup_threshold: 10000
pipeline_metrics_threshold: 500
telemetry_extractor_failed_threshold: 1000
telemetry_assess_threshold: 1000