# Memory settings for the monitoring components, given as
# `<component>_reservation_memory` / `<component>_limit_memory` pairs.
# NOTE(review): the templates that consume these are not visible here;
# presumably they map to container memory reservations and limits - confirm.
prometheus_reservation_memory: 1G
prometheus_limit_memory: 1G
# Retention time for Prometheus storage.
prometheus_storage_retention_time: 90d
alertmanager_reservation_memory: 100M
alertmanager_limit_memory: 100M
node_exporter_reservation_memory: 16M
node_exporter_limit_memory: 32M
cadvisor_reservation_memory: 100M
cadvisor_limit_memory: 100M
elasticsearch_exporter_reservation_memory: 8M
elasticsearch_exporter_limit_memory: 24M
postgres_exporter_reservation_memory: 16M
postgres_exporter_limit_memory: 32M
statsd_exporter_reservation_memory: 8M
statsd_exporter_limit_memory: 16M
blackbox_exporter_reservation_memory: 16M
blackbox_exporter_limit_memory: 32M
jsonpath_exporter_reservation_memory: 32M
jsonpath_exporter_limit_memory: 64M
azure_blob_exporter_reservation_memory: 16M
azure_blob_exporter_limit_memory: 64M
grafana_reservation_memory: 100M
grafana_limit_memory: 100M
# NOTE(review): both keys below are spelled "theshold" (sic). The spelling is
# kept because consumers outside this view may reference these exact names.
container_cpu_usage_percentage_theshold: 90
container_memory_usage_percentage_theshold: 90
# Override this at environment level
expected_minimum_logs_per_minute: 0
# NOTE(review): presumably the percentage of server-side HTTP errors that
# triggers an alert - confirm against the alert rules.
server_side_http_errors_threshold_percentage: 1
# Defaults to 1 day (24 hours) with 1 hour extra for backup process to complete
# (25 * 60 = 1500 minutes).
expected_data_backup_interval_in_minutes: "{{ 25 * 60 }}"
# Defaults to 1 day (24 hours) with 1 hour extra for snapshot process to complete
# (25 * 60 = 1500 minutes).
expected_elasticsearch_snapshot_interval_in_minutes: "{{ 25 * 60 }}"
# Defaults to 1KB. Good enough to catch errors mentioned in https://about.gitlab.com/2017/02/01/gitlab-dot-com-database-incident/
expected_data_backup_size_in_bytes: 1024
# Boolean toggles for the Postgres availability check and Docker metrics scraping.
enable_postgres_availability_check: true
enable_scraping_docker_metrics: false
# Quoted, so this value is a string rather than an integer.
docker_metrics_port: "2377"
# Uses `kong_replicas` when it is defined, otherwise 1.
kong_cluster_expected_number_of_nodes: "{{ kong_replicas | default(1) }}"
# Destination directories for the monitor stack files and config files.
monitor_stack_files_dest_dir: /opt/docker/stacks/stateful_monitor/stack
monitor_config_files_dest_dir: /opt/docker/stacks/stateful_monitor/config
# Version and name of the docker service replicas exporter.
docker_service_replicas_exporter_version: v0.0.3
docker_service_replicas_exporter_name: docker-service-replicas-exporter
# Unquoted, so YAML parses this port as an integer (unlike docker_metrics_port).
es_port: 9200
# Config template file names for the monitor stack: Prometheus, Alertmanager,
# exporter configs and the alert rule files.
# NOTE(review): the task that consumes this list is not visible here.
monitor_config_templates:
- prometheus.yml
- alertmanagerconfig.yml
- blackboxconfig.yml
- statsd_mapping.yml
- elasticsearch_snapshots_exporter_config.yml
- data_backup_azure_blob_exporter_config.yml
- alertrules.nodes.yml
- alertrules.task.yml
- alertrules.es.yml
- alertrules.logs.yml
- alertrules.backups.yml
- alertrules.services.yml
- alertrules.postgresql.yml
- alertrules.process.yml
- alertrules.kafkalag.yml
- jmx_httpserver.yml
# Postgres query template file names, one for the master and one for the slave.
# NOTE(review): presumably consumed by the postgres exporter setup - confirm.
monitor_config_templates_postgres:
- postgresmasterqueries.yml
- postgresslavequeries.yml
# DevOps mailing list and severity filter, taken from the `alerts_mailing_list`
# and `alerts_mailing_severity_filter` variables (defined outside this view).
# The team entries in `service_teams` use these as their fallback values.
devops_alerts_mailing_list: "{{ alerts_mailing_list }}"
devops_alerts_mailing_severity_filter: "{{ alerts_mailing_severity_filter }}"
# Alert routing per team. Each entry carries:
#   team                    - team identifier
#   alerts_mailing_list     - the team's mailing list; every team except
#                             devops_team falls back to
#                             devops_alerts_mailing_list when its own
#                             variable is undefined
#   services                - service names/patterns assigned to the team
#                             (entries such as `monitor_.*` look like regexes;
#                             confirm how the consumer matches them)
#   severity_mailing_filter - severity filter for the team's mail; only the
#                             druid team has its own override variable
service_teams:
- team: devops_team
  alerts_mailing_list: "{{ devops_alerts_mailing_list }}"
  services:
  - monitor_.*
  - monit
  - logger_.*
  - proxy_.*
  severity_mailing_filter: "{{ devops_alerts_mailing_severity_filter }}"
- team: app_team
  alerts_mailing_list: "{{ app_alerts_mailing_list | default(devops_alerts_mailing_list) }}"
  services:
  - actor-service
  - learner-service
  - lms-service
  - content-service
  - player_player
  - cassandra
  - composite_search
  - analytics-api
  - tomcat
  - logstash
  - search
  - neo4j
  severity_mailing_filter: "{{ devops_alerts_mailing_severity_filter }}"
- team: keycloak_team
  alerts_mailing_list: "{{ keycloak_alerts_mailing_list | default(devops_alerts_mailing_list)}}"
  services:
  - keycloak
  severity_mailing_filter: "{{ devops_alerts_mailing_severity_filter }}"
- team: api_manager_team
  alerts_mailing_list: "{{ api_manager_alerts_mailing_list | default(devops_alerts_mailing_list)}}"
  services:
  - api-manager_.*
  - adminutil_.*
  severity_mailing_filter: "{{ devops_alerts_mailing_severity_filter }}"
- team: site_team
  alerts_mailing_list: "{{ site_alerts_mailing_list | default(devops_alerts_mailing_list)}}"
  services:
  - sunbird_static_site
  severity_mailing_filter: "{{ devops_alerts_mailing_severity_filter }}"
- team: druid
  alerts_mailing_list: "{{ druid_alerts_mailing_list | default(devops_alerts_mailing_list)}}"
  services:
  - druidzookeeper
  - druidpostgres
  - overlord
  - coordinator
  - historical
  - broker
  - middlemanager
  - druid
  # `d` is the short alias of the Jinja2 `default` filter.
  severity_mailing_filter: "{{ druid_alerts_mailing_severity_filter | d(devops_alerts_mailing_severity_filter) }}"
# Blackbox probe definitions. Each entry carries:
#   service_name - name of the probed service
#   probe_module - probe module name (all entries use http_2xx)
#   targets      - list of URLs to probe
service_blackbox_checks:
- service_name: 'analytics-api'
  probe_module: http_2xx
  targets:
  - "{{sunbird_analytics_api_base_url}}/health"
- service_name: 'learning-service'
  probe_module: http_2xx
  targets:
  - "{{sunbird_content_repo_api_base_url}}/health"
- service_name: 'search-service'
  probe_module: http_2xx
  targets:
  - "{{sunbird_search_service_api_base_url}}/health"
- service_name: 'monit'
  probe_module: http_2xx
  targets:
  # First host of the `swarm-bootstrap-manager` inventory group, port 2812.
  - "http://{{ groups['swarm-bootstrap-manager'][0] }}:2812"
- service_name: 'keycloak'
  probe_module: http_2xx
  targets:
  # NOTE(review): the URL-encoded redirect_uri hard-codes https
  # (`https%3A%2F%2F`) while the base URL uses {{proto}} - confirm intended.
  - "{{proto}}://{{proxy_server_name}}/auth/realms/sunbird/protocol/openid-connect/auth?client_id=portal&state=foo&redirect_uri=https%3A%2F%2F{{proxy_server_name}}%2Fprivate%2Findex%3Fauth_callback%3D1&scope=openid&response_type=code"
# Route prefix and external URL for Prometheus; the URL is built from `proto`,
# `api__host`, port 9090 and the route prefix.
prometheus_route_prefix: prometheus
prometheus_web_external_url: "{{proto}}://{{api__host}}:9090/{{ prometheus_route_prefix }}"
# Route prefix and external URL for Alertmanager; same construction on port 9093.
prometheus_alertmanager_route_prefix: alertmanager
prometheus_alertmanager_web_external_url: "{{proto}}://{{api__host}}:9093/{{ prometheus_alertmanager_route_prefix }}"
# Connection settings for the postgres exporter.
# The password has no default here: it is an explicit null (same parsed value
# as the previous bare `key:` form). NOTE(review): presumably supplied per
# environment from a secret store - confirm.
postgres_exporter_password: null
postgres_exporter_postgres_port: 5432
postgres_exporter_user: postgres_exporter
# Group and owner names, both set to root.
root_group: root
root_owner: root
# Name of the backup storage for Prometheus.
backup_storage_name: prometheus_backup
# Mount point of the stateful Prometheus data directory.
prometheus_stateful_mount_point: "/root/dockerdata/prometheus_stateful/data/"
# NOTE(review): uses an `MB` suffix, whereas the other memory settings in this
# file use `M`/`G` - confirm the consumer accepts this unit.
docker_service_replicas_memory_limit: 512MB
#################################################### Monitoring limits ################################################
# Alert thresholds per severity level; within each group the value rises from
# Warning to Critical to Fatal.
# NOTE(review): the `_Warning`/`_Critical`/`_Fatal` suffixes are capitalised,
# unlike the snake_case keys elsewhere in this file; names are kept as-is.
# Container CPU usage (percent).
container_cpu_usage_percentage_threshold_Warning: 75
container_cpu_usage_percentage_threshold_Critical: 85
container_cpu_usage_percentage_threshold_Fatal: 95
# Container memory usage (percent).
container_memory_usage_percentage_threshold_Warning: 75
container_memory_usage_percentage_threshold_Critical: 85
container_memory_usage_percentage_threshold_Fatal: 95
# Node CPU usage (percent).
node_cpu_usage_percentage_threshold_Warning: 75
node_cpu_usage_percentage_threshold_Critical: 85
node_cpu_usage_percentage_threshold_Fatal: 95
# Node memory usage (percent).
node_memory_usage_percentage_threshold_Warning: 75
node_memory_usage_percentage_threshold_Critical: 85
node_memory_usage_percentage_threshold_Fatal: 95
# Node load average (unit/scale not shown here - confirm against the alert rules).
node_load_avg_threshold_Warning: 85
node_load_avg_threshold_Critical: 95
node_load_avg_threshold_Fatal: 120
# Node disk usage (percent).
node_disk_usage_percentage_threshold_Warning: 75
node_disk_usage_percentage_threshold_Critical: 85
node_disk_usage_percentage_threshold_Fatal: 95
# Number of Postgres connections.
postgres_number_of_connections_Warning: 100
postgres_number_of_connections_Critical: 110
postgres_number_of_connections_Fatal: 130
# Elasticsearch filesystem data remaining thresholds. These measure what is
# left rather than what is used, so the value falls as severity rises
# (25 > 15 > 10). NOTE(review): presumably percentages - confirm.
elasticsearch_filesystem_data_remaining_threshold_Warning: 25
elasticsearch_filesystem_data_remaining_threshold_Critical: 15
elasticsearch_filesystem_data_remaining_threshold_Fatal: 10
# Secor consumer lag thresholds, one key per backup stream.
# NOTE(review): units are not shown here - presumably a message count compared
# in the kafkalag alert rules; confirm.
events_deviceprofile_backup_threshold: 1000
telemetry_channel_backup_threshold: 10000
telemetry_derived_backup_threshold: 10000
telemetry_failed_backup_threshold: 10000
telemetry_ingestion_backup_threshold: 1000
graph_events_backup_threshold: 500
telemetry_raw_backup_threshold: 10000
telemetry_unique_backup_threshold: 10000
learning_failed_events_backup_threshold: 500
telemetry_denorm_backup_threshold: 10000
pipeline_metrics_threshold: 500
telemetry_extractor_failed_threshold: 1000
telemetry_assess_threshold: 1000