Sample Prometheus configuration (YAML) file used to scrape the metrics.
# Prometheus server configuration: global timings, alerting-rule files,
# the Alertmanager endpoint, and the scrape jobs.
global:
  # Scrape targets and evaluate rules every 15 seconds.
  scrape_interval: 15s
  evaluation_interval: 15s
  # Label attached when this server talks to external systems
  # (for example Alertmanager).
  external_labels:
    monitor: 'king-demo-monitor'

# Alerting-rule files loaded by this server.
rule_files:
  - "swarm_node.rules.yml"
  - "swarm_task.rules.yml"

# Where firing alerts are sent.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Targets are discovered through DNS A-record lookups of the given name;
  # A records carry no port, so it is set explicitly.
  - job_name: 'cadvisor'
    dns_sd_configs:
      - names:
          - 'tasks.cadvisor'
        type: 'A'
        port: 8080

  # Same discovery mechanism as above, for the node-exporter tasks.
  - job_name: 'node-exporter'
    dns_sd_configs:
      - names:
          - 'tasks.node-exporter'
        type: 'A'
        port: 9100
Sample alerting rules for the Docker Swarm cluster nodes and containers.
Alert rule for Nodes
# Alerting rules for Swarm nodes (load, memory, disk usage, disk fill rate).
#
# Every expression is joined with `node_meta` via
# `* ON(instance) GROUP_LEFT(node_name)` so that the `node_name` label used
# in the annotations is present on the resulting series.
# NOTE(review): the join multiplies by the value of node_meta, so these
# rules assume that series has the value 1 - confirm against the exporter.
groups:
  - name: swarm_node.rules.yml
    rules:
      # 5-minute load average above 5 for one minute.
      # The node_meta join is required here as well; without it the series
      # has no `node_name` label and the annotations render it empty.
      - alert: node_load_usage
        expr: node_load5 * ON(instance) GROUP_LEFT(node_name) node_meta > 5
        for: 1m
        labels:
          severity: warning
        annotations:
          # Load average is not a percentage, so no "%" suffix.
          description: >-
            Swarm node {{ $labels.node_name }} 5-minute load average is at
            {{ humanize $value}}.
          summary: "load alert for Swarm node '{{ $labels.node_name }}'"

      # Used memory as a percentage of total, per node, above 80%.
      - alert: node_memory_usage
        expr: >-
          sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes)
          * ON(instance) GROUP_LEFT(node_name) node_meta * 100)
          BY (node_name) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          description: >-
            Swarm node {{ $labels.node_name }} memory usage is at
            {{ humanize $value}}%.
          summary: "Memory alert for Swarm node '{{ $labels.node_name }}'"

      # Used space on the "/rootfs" mountpoint as a percentage, above 85%.
      - alert: node_disk_usage
        expr: >-
          ((node_filesystem_size_bytes{mountpoint="/rootfs"}
          - node_filesystem_free_bytes{mountpoint="/rootfs"})
          * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"})
          * ON(instance) GROUP_LEFT(node_name) node_meta > 85
        for: 1m
        labels:
          severity: warning
        annotations:
          description: >-
            Swarm node {{ $labels.node_name }} disk usage is at
            {{ humanize $value}}%.
          summary: "Disk alert for Swarm node '{{ $labels.node_name }}'"

      # Linear extrapolation of the last hour of free-space samples,
      # 6 hours (6 * 3600 s) ahead; fires when the predicted free bytes
      # stay below zero for a full hour.
      - alert: node_disk_fill_rate_6h
        expr: >-
          predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h],
          6 * 3600)
          * ON(instance) GROUP_LEFT(node_name) node_meta < 0
        for: 1h
        labels:
          severity: critical
        annotations:
          description: >-
            Swarm node {{ $labels.node_name }} disk is going to fill up in
            6h.
          summary: "Disk fill alert for Swarm node '{{ $labels.node_name }}'"
Alert rule for Containers
# Alerting rules for Swarm tasks (containers), based on container metrics
# that carry the Swarm task-name and node-id labels.
#
# The selector `container_label_com_docker_swarm_task_name=~".+"` keeps only
# series whose task-name label is non-empty.
# NOTE(review): neither rule sets a `severity` label, unlike the rules in
# the swarm_node.rules.yml group - confirm whether alert routing needs one.
groups:
  - name: swarm_task.rules.yml
    rules:
      # Per-task CPU usage: rate of CPU seconds over 1m, summed per task and
      # node, times 100 (percent of one core), above 70 for one minute.
      - alert: task_high_cpu_usage_70
        expr: >-
          sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m]))
          BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)
          * 100 > 70
        for: 1m
        annotations:
          description: >-
            {{ $labels.container_label_com_docker_swarm_task_name }} on
            '{{ $labels.container_label_com_docker_swarm_node_id }}'
            CPU usage is at {{ humanize $value}}%.
          summary: >-
            CPU alert for Swarm task
            '{{ $labels.container_label_com_docker_swarm_task_name }}'
            on '{{ $labels.container_label_com_docker_swarm_node_id }}'

      # Per-task resident memory, summed per task and node, above 1e+09
      # for one minute.
      - alert: task_high_memory_usage_1g
        expr: >-
          sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"})
          BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)
          > 1e+09
        for: 1m
        annotations:
          description: >-
            {{ $labels.container_label_com_docker_swarm_task_name }} on
            '{{ $labels.container_label_com_docker_swarm_node_id }}'
            memory usage is {{ humanize $value}}.
          summary: >-
            Memory alert for Swarm task
            '{{ $labels.container_label_com_docker_swarm_task_name }}'
            on '{{ $labels.container_label_com_docker_swarm_node_id }}'
Comments