11groups :
22- name : cortex-cinder-alerts
33 rules :
4- - alert : CortexCinderInitialPlacementDown
4+ - alert : CortexCinderSchedulingDown
55 expr : |
6- up{component= "cortex-cinder-scheduler", namespace="cortex-cinder "} != 1 or
7- absent(up{component= "cortex-cinder-scheduler", namespace="cortex-cinder "})
6+ up{pod=~ "cortex-cinder-scheduling-.* "} != 1 or
7+ absent(up{pod=~ "cortex-cinder-scheduling-.* "})
88 for : 5m
99 labels :
1010 context : liveness
@@ -14,8 +14,102 @@ groups:
1414 support_group : workload-management
1515 playbook : docs/support/playbook/cortex/down
1616 annotations :
17- summary : " Cortex initial placement for Cinder is down"
17+ summary : " Cortex Scheduling for Cinder is down"
1818 description : >
19- The Cortex initial placement is down. Initial placement requests from Cinder will
19+ The Cortex scheduling service is down. Scheduling requests from Cinder will
2020 not be served. This is no immediate problem, since Cinder will continue
21- placing new volumes. However, the placement will be less desirable.
21+ placing new volumes. However, the placement will be less desirable.
# Liveness alert for the Cortex knowledge pods that serve Cinder.
- alert: CortexCinderKnowledgeDown
  # Two cases are covered: a matching `up` series reports a value other
  # than 1, or no matching `up` series exists at all (`absent`).
  expr: |
    up{pod=~"cortex-cinder-knowledge-.*"} != 1 or
    absent(up{pod=~"cortex-cinder-knowledge-.*"})
  # The condition has to hold for five minutes before the alert fires.
  for: 5m
  labels:
    context: liveness
    dashboard: cortex/cortex
    service: cortex
    severity: warning
    support_group: workload-management
    playbook: docs/support/playbook/cortex/down
  annotations:
    summary: 'Cortex Knowledge for Cinder is down'
    description: >
      The Cortex Knowledge service is down. This is no immediate
      problem, since cortex is still able to process requests, but the
      quality of the responses may be affected.
# Warns when the scheduler API answers with 4xx statuses at a sustained rate.
- alert: CortexCinderHttpRequest400sTooHigh
  # Per-second rate, over a 5m window, of requests whose `status` label
  # starts with "4"; the threshold is 0.1 requests per second.
  # The folded scalar joins the two lines below with a single space.
  expr: >-
    rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"4.+"}[5m])
    > 0.1
  for: 5m
  labels:
    context: api
    dashboard: cortex/cortex
    service: cortex
    severity: warning
    support_group: workload-management
  annotations:
    summary: 'Cinder Scheduler HTTP request 400 errors too high'
    description: >
      Cinder Scheduler is responding to placement requests with HTTP
      4xx errors. This is expected when the scheduling request cannot
      be served by Cortex. However, it could also indicate that the
      request format has changed and Cortex is unable to parse it.
# Warns when the scheduler API answers with 5xx statuses at a sustained rate.
- alert: CortexCinderSchedulingHttpRequest500sTooHigh
  # Per-second rate, over a 5m window, of requests whose `status` label
  # starts with "5"; the threshold is 0.1 requests per second.
  # The folded scalar joins the two lines below with a single space.
  expr: >-
    rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"5.+"}[5m])
    > 0.1
  for: 5m
  labels:
    context: api
    dashboard: cortex/cortex
    service: cortex
    severity: warning
    support_group: workload-management
  annotations:
    summary: 'Cinder Scheduler HTTP request 500 errors too high'
    # Cinder places volumes, not VMs; the wording below reflects that.
    description: >
      Cinder Scheduler is responding to placement requests with HTTP 5xx errors.
      This is not expected and indicates that Cortex is having some internal problem.
      Cinder will continue to place new volumes, but the placement will be less desirable.
      Thus, no immediate action is needed.
# Warns when a process behind the cortex-cinder-metrics service holds more
# than 6000 MiB of resident memory for five minutes.
- alert: CortexCinderHighMemoryUsage
  # 6000 * 1024 * 1024 bytes = 6000 MiB, the figure quoted in the description.
  expr: process_resident_memory_bytes{service="cortex-cinder-metrics"} > 6000 * 1024 * 1024
  for: 5m
  labels:
    context: memory
    dashboard: cortex/cortex
    service: cortex
    severity: warning
    support_group: workload-management
  annotations:
    # NOTE(review): both templates read a `component` label, while the
    # selector only filters on `service` -- confirm the series carry it.
    summary: '`{{$labels.component}}` uses too much memory'
    description: >
      `{{$labels.component}}` should not be using more than 6000 MiB of
      memory. Usually it should use much less, so there may be a memory
      leak or other changes that are causing the memory usage to
      increase significantly.
# Warns when a process behind the cortex-cinder-metrics service burns more
# than half a CPU core for five minutes.
- alert: CortexCinderHighCPUUsage
  # rate() of CPU seconds is CPU-seconds per second, so 0.5 corresponds to
  # the "50% of a single CPU core" quoted in the description.
  # NOTE(review): a 1m window only yields a rate when it contains at least
  # two samples -- confirm the scrape interval is comfortably below 1m.
  expr: rate(process_cpu_seconds_total{service="cortex-cinder-metrics"}[1m]) > 0.5
  for: 5m
  labels:
    context: cpu
    dashboard: cortex/cortex
    service: cortex
    severity: warning
    support_group: workload-management
  annotations:
    # NOTE(review): both templates read a `component` label, while the
    # selector only filters on `service` -- confirm the series carry it.
    summary: '`{{$labels.component}}` uses too much CPU'
    description: >
      `{{$labels.component}}` should not be using more than 50% of a
      single CPU core. Usually it should use much less, so there may be
      a CPU leak or other changes that are causing the CPU usage to
      increase significantly.
# Warns when database connection attempts keep occurring at a sustained rate.
- alert: CortexCinderTooManyDBConnectionAttempts
  # Per-second rate of the connection-attempt counter over a 5m window;
  # the threshold is 0.1 attempts per second.
  expr: rate(cortex_db_connection_attempts_total{service="cortex-cinder-metrics"}[5m]) > 0.1
  for: 5m
  labels:
    context: db
    dashboard: cortex/cortex
    service: cortex
    severity: warning
    support_group: workload-management
  annotations:
    # NOTE(review): both templates read a `component` label, while the
    # selector only filters on `service` -- confirm the series carry it.
    summary: '`{{$labels.component}}` is trying to connect to the database too often'
    description: >
      `{{$labels.component}}` is trying to connect to the database too
      often. This may happen when the database is down or the
      connection parameters are misconfigured.
0 commit comments