Skip to content

Commit ff26a37

Browse files
committed
mark deprecated metric label and add replacement
Signed-off-by: Jifei Wang <[email protected]>
1 parent af1bc68 commit ff26a37

File tree

2 files changed

+44
-4
lines changed

2 files changed

+44
-4
lines changed

cmd/scheduler/metrics.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
7575
)
7676
nodevGPUMemoryAllocatedDesc := prometheus.NewDesc(
7777
"GPUDeviceMemoryAllocated",
78-
"Device memory allocated for a certain GPU",
78+
"Device memory allocated for a certain GPU. The label devicecores will be deprecated in 2.8.0",
7979
[]string{"nodeid", "deviceuuid", "deviceidx", "devicecores"}, nil,
8080
)
8181
nodevGPUSharedNumDesc := prometheus.NewDesc(
@@ -91,7 +91,7 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
9191
)
9292
nodeGPUOverview := prometheus.NewDesc(
9393
"nodeGPUOverview",
94-
"GPU overview on a certain node",
94+
"GPU overview on a certain node. The label devicecores will be deprecated in 2.8.0",
9595
[]string{"nodeid", "deviceuuid", "deviceidx", "devicecores", "sharedcontainers", "devicememorylimit", "devicetype"}, nil,
9696
)
9797
nodeGPUMemoryPercentage := prometheus.NewDesc(

cmd/vGPUmonitor/metrics.go

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,14 +105,34 @@ var (
105105
)
106106
ctrDeviceMemorydesc = prometheus.NewDesc(
107107
"Device_memory_desc_of_container",
108-
"Container device meory description",
108+
"Container device memory description. The labels context, module, data and offset will be deprecated in 2.8.0.",
109109
[]string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid", "context", "module", "data", "offset"}, nil,
110110
)
111111
ctrDeviceUtilizationdesc = prometheus.NewDesc(
112112
"Device_utilization_desc_of_container",
113113
"Container device utilization description",
114114
[]string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil,
115115
)
116+
ctrDeviceMemoryContext = prometheus.NewDesc(
117+
"Device_memory_context_of_container",
118+
"Container device memory context description",
119+
[]string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil,
120+
)
121+
ctrDeviceMemoryModule = prometheus.NewDesc(
122+
"Device_memory_module_of_container",
123+
"Container device memory module description",
124+
[]string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil,
125+
)
126+
ctrDeviceMemoryData = prometheus.NewDesc(
127+
"Device_memory_data_of_container",
128+
"Container device memory data description",
129+
[]string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil,
130+
)
131+
ctrDeviceMemoryOffset = prometheus.NewDesc(
132+
"Device_memory_offset_of_container",
133+
"Container device memory offset description",
134+
[]string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil,
135+
)
116136
ctrDeviceLastKernelDesc = prometheus.NewDesc(
117137
"Device_last_kernel_of_container",
118138
"Container device last kernel description",
@@ -400,11 +420,31 @@ func (cc ClusterManagerCollector) collectContainerMetrics(ch chan<- prometheus.M
400420

401421
// Send memory-related metrics with additional labels
402422
memoryLabels := append(labels, fmt.Sprint(memoryContextSize), fmt.Sprint(memoryModuleSize), fmt.Sprint(memoryBufferSize), fmt.Sprint(memoryOffset))
403-
if err := sendMetric(ch, ctrDeviceMemorydesc, prometheus.CounterValue, float64(memoryTotal), memoryLabels...); err != nil {
423+
if err := sendMetric(ch, ctrDeviceMemorydesc, prometheus.GaugeValue, float64(memoryTotal), memoryLabels...); err != nil {
404424
klog.Errorf("Failed to send memory-related metrics for device %d in Pod %s/%s, Container %s: %v", i, pod.Namespace, pod.Name, ctr.Name, err)
405425
return err
406426
}
407427

428+
if err := sendMetric(ch, ctrDeviceMemoryContext, prometheus.GaugeValue, float64(memoryContextSize), labels...); err != nil {
429+
klog.Errorf("Failed to send memory context metrics for device %d in Pod %s/%s, Container %s: %v", i, pod.Namespace, pod.Name, ctr.Name, err)
430+
return err
431+
}
432+
433+
if err := sendMetric(ch, ctrDeviceMemoryModule, prometheus.GaugeValue, float64(memoryModuleSize), labels...); err != nil {
434+
klog.Errorf("Failed to send memory module metrics for device %d in Pod %s/%s, Container %s: %v", i, pod.Namespace, pod.Name, ctr.Name, err)
435+
return err
436+
}
437+
438+
if err := sendMetric(ch, ctrDeviceMemoryData, prometheus.GaugeValue, float64(memoryBufferSize), labels...); err != nil {
439+
klog.Errorf("Failed to send memory buffer metrics for device %d in Pod %s/%s, Container %s: %v", i, pod.Namespace, pod.Name, ctr.Name, err)
440+
return err
441+
}
442+
443+
if err := sendMetric(ch, ctrDeviceMemoryOffset, prometheus.GaugeValue, float64(memoryOffset), labels...); err != nil {
444+
klog.Errorf("Failed to send memory offset metrics for device %d in Pod %s/%s, Container %s: %v", i, pod.Namespace, pod.Name, ctr.Name, err)
445+
return err
446+
}
447+
408448
if err := sendMetric(ch, ctrDeviceUtilizationdesc, prometheus.GaugeValue, float64(smUtil), labels...); err != nil {
409449
klog.Errorf("Failed to send SM utilization metric for device %d in Pod %s/%s, Container %s: %v", i, pod.Namespace, pod.Name, ctr.Name, err)
410450
return err

0 commit comments

Comments
 (0)