Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cmd/scheduler/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
)
nodevGPUMemoryAllocatedDesc := prometheus.NewDesc(
"GPUDeviceMemoryAllocated",
"Device memory allocated for a certain GPU",
"Device memory allocated for a certain GPU. The label devicecores will be deprecated in 2.8.0",
[]string{"nodeid", "deviceuuid", "deviceidx", "devicecores"}, nil,
)
nodevGPUSharedNumDesc := prometheus.NewDesc(
Expand All @@ -91,7 +91,7 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
)
nodeGPUOverview := prometheus.NewDesc(
"nodeGPUOverview",
"GPU overview on a certain node",
"GPU overview on a certain node. The label devicecores will be deprecated in 2.8.0",
[]string{"nodeid", "deviceuuid", "deviceidx", "devicecores", "sharedcontainers", "devicememorylimit", "devicetype"}, nil,
)
nodeGPUMemoryPercentage := prometheus.NewDesc(
Expand Down
11 changes: 3 additions & 8 deletions cmd/vGPUmonitor/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ var (
)
ctrDeviceMemorydesc = prometheus.NewDesc(
"Device_memory_desc_of_container",
"Container device meory description",
[]string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid", "context", "module", "data", "offset"}, nil,
"Container device memory description.",
[]string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil,
)
ctrDeviceUtilizationdesc = prometheus.NewDesc(
"Device_utilization_desc_of_container",
Expand Down Expand Up @@ -378,10 +378,6 @@ func (cc ClusterManagerCollector) collectContainerMetrics(ch chan<- prometheus.M
// Collect device metrics
memoryTotal := c.Info.DeviceMemoryTotal(i)
memoryLimit := c.Info.DeviceMemoryLimit(i)
memoryContextSize := c.Info.DeviceMemoryContextSize(i)
memoryModuleSize := c.Info.DeviceMemoryModuleSize(i)
memoryBufferSize := c.Info.DeviceMemoryBufferSize(i)
memoryOffset := c.Info.DeviceMemoryOffset(i)
smUtil := c.Info.DeviceSmUtil(i)
lastKernelTime := c.Info.LastKernelTime()

Expand All @@ -399,8 +395,7 @@ func (cc ClusterManagerCollector) collectContainerMetrics(ch chan<- prometheus.M
}

// Send memory-related metrics with additional labels
memoryLabels := append(labels, fmt.Sprint(memoryContextSize), fmt.Sprint(memoryModuleSize), fmt.Sprint(memoryBufferSize), fmt.Sprint(memoryOffset))
if err := sendMetric(ch, ctrDeviceMemorydesc, prometheus.CounterValue, float64(memoryTotal), memoryLabels...); err != nil {
if err := sendMetric(ch, ctrDeviceMemorydesc, prometheus.GaugeValue, float64(memoryTotal)); err != nil {
klog.Errorf("Failed to send memory-related metrics for device %d in Pod %s/%s, Container %s: %v", i, pod.Namespace, pod.Name, ctr.Name, err)
return err
}
Expand Down
Loading