Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,23 @@ For information on platform support and getting started, visit the official docu

## Support and Getting Help
Please open [an issue on the GitHub project](https://github.com/NVIDIA/gpu-operator/issues/new) for any questions. Your feedback is appreciated.

## vGPU License Visibility
When the operator configures a node for `vm-vgpu` workloads, it now reports the license state directly through the Kubernetes API:

- Each vGPU node receives an annotation `nvidia.com/vgpu-license-statuses` that contains a JSON snapshot of the most recent `nvidia-smi vgpu -q` output, including per-device status and expiry timestamps.
- The `ClusterPolicy` resource exposes a `Licensed` condition that summarizes the state of every vGPU node. It turns `False` if any device is unlicensed or nearing expiry, and `Unknown` if data from the node-status-exporter is missing.

You can inspect the node-level data with:

```bash
kubectl get node <name> -o jsonpath='{.metadata.annotations.nvidia\.com/vgpu-license-statuses}'
```

and the cluster-level summary through:

```bash
kubectl get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.conditions[?(@.type=="Licensed")]}'
```

This makes it easier for users and automation to diagnose misconfigured or expired licenses without shelling into the node.
1 change: 1 addition & 0 deletions assets/state-node-status-exporter/0300_clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ rules:
- get
- list
- watch
- patch
199 changes: 199 additions & 0 deletions cmd/nvidia-validator/license_collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"bufio"
"bytes"
"context"
"fmt"
"os/exec"
"regexp"
"strings"
"time"

"github.com/NVIDIA/gpu-operator/internal/licenseinfo"
)

// Constants describing the external command used to read license state.
const (
// licenseQueryCommand is the executable that is run to query license state.
licenseQueryCommand = "nvidia-smi"
// licenseQueryArgs holds the command arguments as one string; runLicenseQuery
// splits it on single spaces before invoking the command.
licenseQueryArgs = "vgpu -q"
// licenseSource is the label passed to licenseinfo.NewSnapshot and used as the
// prefix of the error returned when the query command fails.
licenseSource = "nvidia-smi vgpu -q"
)

var (
	// gpuHeaderRegexp matches a device header line such as
	// "GPU 00000000:02:00.0" and captures the device identifier.
	//
	// The pattern is anchored at the end of the line: the identifier must be
	// the only thing following "GPU". Without the anchor, attribute lines that
	// begin with "GPU " followed by a letter that happens to be a hex digit
	// (for example "GPU Current Temp : 40 C" or "GPU C2C Mode : N/A") would be
	// treated as the start of a new device with a bogus identifier.
	gpuHeaderRegexp = regexp.MustCompile(`^GPU\s+([0-9A-Fa-fx:.]+)\s*$`)
	// keyValueRegexp splits a "key : value" line at the first colon. The key
	// must be non-empty; the value may be empty.
	keyValueRegexp = regexp.MustCompile(`^([^:]+):\s*(.*)$`)
	// expiryMarker introduces the expiry timestamp embedded in a license
	// status value, e.g. "Licensed (Expiry: 2025-6-26 21:46:51 GMT)".
	expiryMarker = "Expiry:"
	// licenseTimeParse lists the time layouts tried, in order, when parsing
	// an expiry timestamp. The first layout that parses wins.
	licenseTimeParse = []string{
		"2006-1-2 15:04:05 MST",
		"2006-01-02 15:04:05 MST",
		"2006-1-2 15:04:05",
		time.RFC3339,
		time.RFC1123Z,
		time.RFC1123,
	}
)

// collectLicenseSnapshot runs the license query, parses its output and returns
// the result as a snapshot created via licenseinfo.NewSnapshot with now.
//
// Every failure (command error, parse error, or no devices found) is both
// returned and written to the snapshot's Error field, so the caller can still
// publish the snapshot for diagnostics. Devices returned by the parser are
// stored on the snapshot even when the parser also reports an error.
func collectLicenseSnapshot(ctx context.Context, now time.Time) (licenseinfo.Snapshot, error) {
	snapshot := licenseinfo.NewSnapshot(nil, licenseSource, now)

	// fail stamps the failure onto the snapshot and returns both together.
	fail := func(cause error) (licenseinfo.Snapshot, error) {
		snapshot.Error = cause.Error()
		return snapshot, cause
	}

	raw, queryErr := runLicenseQuery(ctx)
	if queryErr != nil {
		return fail(queryErr)
	}

	parsed, parseErr := parseVGPULicenseOutput(raw)
	snapshot.Devices = parsed
	switch {
	case parseErr != nil:
		return fail(parseErr)
	case len(parsed) == 0:
		return fail(fmt.Errorf("no vGPU license information found in nvidia-smi output"))
	}

	return snapshot, nil
}

func runLicenseQuery(ctx context.Context) (string, error) {
args := strings.Split(licenseQueryArgs, " ")
cmd := exec.CommandContext(ctx, licenseQueryCommand, args...)
var combined bytes.Buffer
cmd.Stdout = &combined
cmd.Stderr = &combined
if err := cmd.Run(); err != nil {
return "", fmt.Errorf("%s failed: %w: %s", licenseSource, err, strings.TrimSpace(combined.String()))
}
return combined.String(), nil
}

// parseVGPULicenseOutput walks the query output line by line and builds one
// DeviceStatus per device header line. Lines before the first header and
// blank lines are ignored; every other line is interpreted as a "key : value"
// attribute of the most recent device.
//
// The devices collected so far are returned even when the scanner reports an
// error.
func parseVGPULicenseOutput(output string) ([]licenseinfo.DeviceStatus, error) {
	var (
		devices []licenseinfo.DeviceStatus
		current *licenseinfo.DeviceStatus
	)

	// flush stores the device currently being assembled, if there is one.
	flush := func() {
		if current != nil {
			devices = append(devices, *current)
		}
	}

	scanner := bufio.NewScanner(strings.NewReader(output))
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}

		if header := gpuHeaderRegexp.FindStringSubmatch(line); len(header) == 2 {
			flush()
			current = &licenseinfo.DeviceStatus{ID: header[1]}
			continue
		}

		if current != nil {
			applyLicenseAttribute(current, line)
		}
	}
	flush()

	if err := scanner.Err(); err != nil {
		return devices, fmt.Errorf("failed to parse license output: %w", err)
	}
	return devices, nil
}

// applyLicenseAttribute updates dev from a single attribute line. Keys other
// than the ones handled below leave dev untouched.
func applyLicenseAttribute(dev *licenseinfo.DeviceStatus, line string) {
	key, value := parseKeyValue(line)
	switch key {
	case "Product Name":
		dev.Product = value
	case "License Status":
		status, inlineExpiry := extractStatusAndExpiry(value)
		dev.Status = status
		dev.Licensed = isStatusLicensed(status)
		// An expiry embedded in the status is only used when no expiry has
		// been recorded for the device yet.
		if inlineExpiry == "" || dev.Expiry != nil {
			return
		}
		recordLicenseExpiry(dev, inlineExpiry)
	case "License Expiry":
		recordLicenseExpiry(dev, value)
	case "vGPU Software Licensed":
		// A non-affirmative value clears the flag; an affirmative one
		// leaves it unchanged.
		if !isAffirmative(value) {
			dev.Licensed = false
		}
	}
}

// recordLicenseExpiry stores the parsed expiry on dev. If raw cannot be parsed
// and dev has no message yet, the raw text is kept in the message instead.
func recordLicenseExpiry(dev *licenseinfo.DeviceStatus, raw string) {
	ts, err := parseLicenseTimestamp(raw)
	if err == nil {
		dev.Expiry = &ts
		return
	}
	if dev.Message == "" {
		dev.Message = fmt.Sprintf("license expiry: %s", raw)
	}
}

// parseKeyValue splits a "key : value" line using keyValueRegexp and returns
// both parts with surrounding whitespace removed. A line that does not match
// the pattern is returned unchanged as the key, with an empty value.
func parseKeyValue(line string) (string, string) {
	if parts := keyValueRegexp.FindStringSubmatch(line); len(parts) == 3 {
		return strings.TrimSpace(parts[1]), strings.TrimSpace(parts[2])
	}
	return line, ""
}

// extractStatusAndExpiry separates a license status value from an expiry that
// follows expiryMarker, e.g. "Licensed (Expiry: <timestamp>)". Each part has
// leading/trailing parentheses and then surrounding whitespace removed.
// When the marker is absent, value is returned as-is with an empty expiry.
func extractStatusAndExpiry(value string) (string, string) {
	markerAt := strings.Index(value, expiryMarker)
	if markerAt < 0 {
		return value, ""
	}

	// tidy strips enclosing parentheses first, then surrounding whitespace.
	tidy := func(part string) string {
		return strings.TrimSpace(strings.Trim(part, "()"))
	}

	before := value[:markerAt]
	after := value[markerAt+len(expiryMarker):]
	return tidy(before), tidy(after)
}

// isStatusLicensed reports whether a license status string describes a
// licensed device. The comparison is case-insensitive: any status mentioning
// "unlicensed", "not licensed" or "expired" is rejected, and the remaining
// statuses are accepted only if they mention "licensed".
func isStatusLicensed(status string) bool {
	normalized := strings.ToLower(status)
	for _, negative := range []string{"unlicensed", "not licensed", "expired"} {
		if strings.Contains(normalized, negative) {
			return false
		}
	}
	// An empty status contains nothing and therefore ends up here as false.
	return strings.Contains(normalized, "licensed")
}

// isAffirmative reports whether value is "yes" or "true", ignoring case.
// The value is compared as given; surrounding whitespace is not removed.
func isAffirmative(value string) bool {
	switch strings.ToLower(value) {
	case "yes", "true":
		return true
	default:
		return false
	}
}

// parseLicenseTimestamp converts an expiry string into a UTC time by trying
// each layout in licenseTimeParse in order and returning the first success.
//
// An error is returned when the trimmed input is empty or "n/a" (any case),
// or when no layout matches.
func parseLicenseTimestamp(value string) (time.Time, error) {
	trimmed := strings.TrimSpace(value)
	switch {
	case trimmed == "", strings.EqualFold(trimmed, "n/a"):
		return time.Time{}, fmt.Errorf("empty expiry")
	}

	for i := range licenseTimeParse {
		parsed, err := time.Parse(licenseTimeParse[i], trimmed)
		if err != nil {
			continue
		}
		return parsed.UTC(), nil
	}

	// The original, untrimmed input is reported to the caller.
	return time.Time{}, fmt.Errorf("unsupported expiry format: %s", value)
}
32 changes: 32 additions & 0 deletions cmd/nvidia-validator/license_collector_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package main

import (
"testing"

"github.com/stretchr/testify/require"
)

// TestParseVGPULicenseOutput parses a two-device sample, one licensed with an
// expiry and one unlicensed with "N/A" expiry, and checks the fields derived
// for each device.
func TestParseVGPULicenseOutput(t *testing.T) {
	const sample = `
GPU 00000000:02:00.0
Product Name : NVIDIA A16
License Status : Licensed (Expiry: 2025-6-26 21:46:51 GMT)
License Expiry : 2025-6-26 21:46:51 GMT

GPU 00000000:82:00.0
Product Name : NVIDIA A16
License Status : Unlicensed
License Expiry : N/A
`

	parsed, err := parseVGPULicenseOutput(sample)
	require.NoError(t, err)
	require.Len(t, parsed, 2)

	// First device: licensed, with a parseable expiry.
	licensed := parsed[0]
	require.Equal(t, "00000000:02:00.0", licensed.ID)
	require.True(t, licensed.Licensed)
	require.NotNil(t, licensed.Expiry)
	require.Equal(t, "Licensed", licensed.Status)

	// Second device: unlicensed, and "N/A" must not yield an expiry.
	unlicensed := parsed[1]
	require.Equal(t, "00000000:82:00.0", unlicensed.ID)
	require.False(t, unlicensed.Licensed)
	require.Nil(t, unlicensed.Expiry)
}
3 changes: 3 additions & 0 deletions cmd/nvidia-validator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ const (
driverValidationCheckDelaySeconds = 60
// pluginValidationCheckDelaySeconds indicates the delay between two checks of the device plugin validation, in seconds
pluginValidationCheckDelaySeconds = 30
// licenseStatusCheckDelaySeconds indicates how often license annotations are refreshed.
licenseStatusCheckDelaySeconds = 60
)

// NodeMetrics contains the port of the metrics server and the
Expand Down Expand Up @@ -308,6 +310,7 @@ func (nm *NodeMetrics) Run() error {
go nm.watchDriverValidation()
go nm.watchDevicePluginValidation()
go nm.watchNVIDIAPCI()
go nm.watchLicenseAnnotations()

log.Printf("Running the metrics server, listening on :%d/metrics", nm.port)
http.Handle("/metrics", promhttp.Handler())
Expand Down
Loading
Loading