Prometheus监控container的告警规则

修改 prometheus.yml 文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Prometheus alerting rules for per-pod container resource usage.
# The value of `rules.yml` is a literal block scalar that Prometheus parses
# as its own YAML rule file, so the nesting below must be preserved.
# NOTE(review): every selector relies on the `pod_name` label; newer
# Kubernetes/cAdvisor versions expose `pod` instead - confirm against the
# scraped metrics before deploying.
rules.yml: |
  groups:
    - name: container.rules
      rules:
        # Memory: RSS summed per pod as a percentage of the summed memory limit.
        # The `> 0` filter on the denominator drops pods whose total limit is 0
        # (no limit configured); without it the division yields +Inf, which
        # satisfies `> 95` and fires a false alert.
        - alert: Container_Memory_RSS
          expr: >-
            (
            sum(container_memory_rss{job="kubernetes-cadvisor",pod_name!=""}) by (pod_name)
            /
            (sum(container_spec_memory_limit_bytes{job="kubernetes-cadvisor",pod_name!=""}) by (pod_name) > 0)
            * 100
            ) > 95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "检测内存使用率过高"
            description: "{{ $labels.pod_name }}内存持续2分钟高于95%. 当前值: {{ $value }}"

        # Network receive: 1m rate in bytes/s divided by 1024 gives KiB/s;
        # the 102400 KiB/s threshold equals 100 MiB/s.
        - alert: Container_Network_RX_Average
          expr: >-
            (
            sum(rate(container_network_receive_bytes_total{job="kubernetes-nodes",pod_name!=""}[1m])) by (pod_name)
            / 1024
            ) > 102400
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "检测网络带宽使用率过高."
            description: "{{ $labels.pod_name }}网络带宽持续2分钟高于 100M. RX带宽使用率: {{ $value }}"

        # Network transmit: same units and threshold as the RX rule above.
        - alert: Container_Network_TX_Average
          expr: >-
            (
            sum(rate(container_network_transmit_bytes_total{job="kubernetes-nodes",pod_name!=""}[1m])) by (pod_name)
            / 1024
            ) > 102400
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "检测网络带宽使用率过高."
            description: "{{ $labels.pod_name }}网络带宽持续2分钟高于 100M. TX带宽使用率: {{ $value }}"

        # CPU: 1m rate of consumed CPU seconds summed per pod, times 100.
        # The result is a percentage of one core, not of the pod's CPU limit.
        - alert: Container_USAGE_CPU_Average
          expr: >-
            (
            sum(rate(container_cpu_usage_seconds_total{job="kubernetes-nodes",image!="",pod_name!=""}[1m])) by (pod_name)
            * 100
            ) > 95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "检测CPU平均使用率过高."
            description: "{{ $labels.pod_name }}CPU持续2分钟高于95% 当前值: {{ $value }}"