From 45211214499a2e621c0f4e8da87cf6a502ad1789 Mon Sep 17 00:00:00 2001 From: haitwang-cloud Date: Wed, 19 Feb 2025 16:32:52 +0800 Subject: [PATCH 1/5] feat: add initial implementation of nvidia plugin components and interfaces Signed-off-by: haitwang-cloud --- Makefile | 2 +- cmd/device-plugin/nvidia/main.go | 320 ++++---- cmd/device-plugin/nvidia/plugin-manager.go | 91 ++- cmd/device-plugin/nvidia/root.go | 85 ++ cmd/device-plugin/nvidia/vgpucfg.go | 14 +- docker/Dockerfile | 9 +- docker/Dockerfile.new | 79 ++ go.mod | 84 +- go.sum | 175 ++--- .../nvidiadevice/nvinternal/cdi/api.go | 41 - .../nvidiadevice/nvinternal/cdi/cdi.go | 193 ----- .../nvidiadevice/nvinternal/cdi/factory.go | 52 -- .../nvidiadevice/nvinternal/cdi/null.go | 59 -- .../nvidiadevice/nvinternal/cdi/options.go | 103 --- .../nvidiadevice/nvinternal/info/version.go | 64 -- .../nvidiadevice/nvinternal/plugin/api.go | 42 - .../nvinternal/plugin/manager/api.go | 41 - .../nvinternal/plugin/manager/factory.go | 152 ---- .../nvinternal/plugin/manager/null.go | 49 -- .../nvinternal/plugin/manager/nvml.go | 61 -- .../nvinternal/plugin/manager/options.go | 84 -- .../nvinternal/plugin/manager/tegra.go | 61 -- .../nvidiadevice/nvinternal/plugin/server.go | 671 ---------------- .../nvinternal/plugin/util_test.go | 156 ---- .../nvidiadevice/nvinternal/rm/allocate.go | 137 ---- .../nvinternal/rm/device_map_test.go | 583 -------------- .../nvidiadevice/nvinternal/rm/health_test.go | 100 --- .../nvidiadevice/nvinternal/rm/helper.go | 54 -- .../nvinternal/rm/nvml_devices_test.go | 179 ----- .../nvinternal/rm/nvml_manager.go | 120 --- .../nvidiadevice/nvinternal/rm/rm.go | 176 ----- .../nvidiadevice/nvinternal/rm/wsl_devices.go | 52 -- pkg/device/nvidia/device.go | 8 +- pkg/nvidia-plugin/api/config/v1/config.go | 160 ++++ pkg/nvidia-plugin/api/config/v1/consts.go | 72 ++ pkg/nvidia-plugin/api/config/v1/duration.go | 69 ++ pkg/nvidia-plugin/api/config/v1/flags.go | 190 +++++ 
pkg/nvidia-plugin/api/config/v1/flags_test.go | 246 ++++++ pkg/nvidia-plugin/api/config/v1/imex.go | 53 ++ pkg/nvidia-plugin/api/config/v1/imex_test.go | 83 ++ pkg/nvidia-plugin/api/config/v1/replicas.go | 355 +++++++++ .../api/config/v1/replicas_test.go | 482 ++++++++++++ pkg/nvidia-plugin/api/config/v1/resources.go | 196 +++++ pkg/nvidia-plugin/api/config/v1/sharing.go | 69 ++ pkg/nvidia-plugin/api/config/v1/strategy.go | 69 ++ pkg/nvidia-plugin/mps-control-daemon/main.go | 255 ++++++ .../mps-control-daemon/mount/mount-shm.go | 108 +++ .../mps-control-daemon/mps/daemon.go | 280 +++++++ .../mps-control-daemon/mps/device.go | 55 ++ .../mps-control-daemon/mps/device_test.go | 112 +++ .../mps-control-daemon/mps/log-tailer.go | 69 ++ .../mps-control-daemon/mps/manager.go | 112 +++ .../mps-control-daemon/mps/options.go | 29 + .../mps-control-daemon/mps/root.go | 59 ++ pkg/nvidia-plugin/pkg/cdi/api.go | 31 + .../pkg}/cdi/api_mock.go | 0 pkg/nvidia-plugin/pkg/cdi/cdi.go | 231 ++++++ pkg/nvidia-plugin/pkg/cdi/imex.go | 63 ++ pkg/nvidia-plugin/pkg/cdi/null.go | 43 + pkg/nvidia-plugin/pkg/cdi/options.go | 102 +++ pkg/nvidia-plugin/pkg/cuda/api.go | 119 +++ pkg/nvidia-plugin/pkg/cuda/cgo_helpers.go | 27 + pkg/nvidia-plugin/pkg/cuda/consts.go | 95 +++ pkg/nvidia-plugin/pkg/cuda/cuda.go | 176 +++++ pkg/nvidia-plugin/pkg/cuda/device.go | 17 + pkg/nvidia-plugin/pkg/cuda/result.go | 178 +++++ .../pkg/dependencies/dependencies.go | 7 + pkg/nvidia-plugin/pkg/flags/kubeclient.go | 114 +++ pkg/nvidia-plugin/pkg/flags/node.go | 46 ++ pkg/nvidia-plugin/pkg/imex/imex.go | 98 +++ pkg/nvidia-plugin/pkg/info/version.go | 48 ++ pkg/nvidia-plugin/pkg/lm/empty.go | 24 + pkg/nvidia-plugin/pkg/lm/imex.go | 182 +++++ pkg/nvidia-plugin/pkg/lm/imex_test.go | 57 ++ pkg/nvidia-plugin/pkg/lm/labeler.go | 45 ++ pkg/nvidia-plugin/pkg/lm/labels.go | 25 + pkg/nvidia-plugin/pkg/lm/list.go | 46 ++ pkg/nvidia-plugin/pkg/lm/machine-type.go | 53 ++ pkg/nvidia-plugin/pkg/lm/mig-strategy.go | 311 ++++++++ 
pkg/nvidia-plugin/pkg/lm/mig-strategy_test.go | 422 ++++++++++ pkg/nvidia-plugin/pkg/lm/nvml.go | 262 +++++++ pkg/nvidia-plugin/pkg/lm/nvml_test.go | 292 +++++++ pkg/nvidia-plugin/pkg/lm/output.go | 155 ++++ pkg/nvidia-plugin/pkg/lm/resource.go | 319 ++++++++ pkg/nvidia-plugin/pkg/lm/resource_test.go | 437 +++++++++++ pkg/nvidia-plugin/pkg/lm/strategy.go | 28 + pkg/nvidia-plugin/pkg/lm/timestamp.go | 37 + pkg/nvidia-plugin/pkg/lm/vgpu.go | 58 ++ pkg/nvidia-plugin/pkg/logger/klog.go | 34 + .../pkg/mig/mig-dp.go} | 34 +- pkg/nvidia-plugin/pkg/mig/mig.go | 124 +++ pkg/nvidia-plugin/pkg/plugin/api.go | 26 + pkg/nvidia-plugin/pkg/plugin/factory.go | 138 ++++ pkg/nvidia-plugin/pkg/plugin/mps.go | 91 +++ pkg/nvidia-plugin/pkg/plugin/options.go | 79 ++ .../pkg}/plugin/register.go | 2 +- .../pkg}/plugin/register_test.go | 14 +- pkg/nvidia-plugin/pkg/plugin/server.go | 742 ++++++++++++++++++ .../pkg}/plugin/server_test.go | 121 ++- .../pkg}/plugin/util.go | 2 +- pkg/nvidia-plugin/pkg/resource/cuda-device.go | 110 +++ pkg/nvidia-plugin/pkg/resource/cuda-lib.go | 88 +++ pkg/nvidia-plugin/pkg/resource/device_mock.go | 437 +++++++++++ pkg/nvidia-plugin/pkg/resource/factory.go | 84 ++ pkg/nvidia-plugin/pkg/resource/fallback.go | 64 ++ .../pkg/resource/fallback_test.go | 62 ++ .../pkg/resource/manager_mock.go | 215 +++++ pkg/nvidia-plugin/pkg/resource/null.go | 57 ++ pkg/nvidia-plugin/pkg/resource/nvml-device.go | 119 +++ pkg/nvidia-plugin/pkg/resource/nvml-lib.go | 94 +++ .../pkg/resource/nvml-mig-device.go | 152 ++++ .../pkg/resource/sysfs-device.go | 77 ++ pkg/nvidia-plugin/pkg/resource/sysfs-lib.go | 74 ++ .../pkg/resource/testing/resource-testing.go | 141 ++++ pkg/nvidia-plugin/pkg/resource/types.go | 45 ++ pkg/nvidia-plugin/pkg/rm/allocate.go | 80 ++ .../pkg}/rm/device_map.go | 125 +-- pkg/nvidia-plugin/pkg/rm/device_map_test.go | 109 +++ .../pkg}/rm/devices.go | 117 +-- .../pkg}/rm/health.go | 73 +- pkg/nvidia-plugin/pkg/rm/health_test.go | 74 ++ 
pkg/nvidia-plugin/pkg/rm/helper.go | 32 + .../pkg}/rm/nvml_devices.go | 89 ++- pkg/nvidia-plugin/pkg/rm/nvml_manager.go | 138 ++++ pkg/nvidia-plugin/pkg/rm/rm.go | 138 ++++ pkg/nvidia-plugin/pkg/rm/rm_test.go | 195 +++++ .../pkg}/rm/tegra_devices.go | 56 +- .../pkg}/rm/tegra_manager.go | 50 +- pkg/nvidia-plugin/pkg/rm/wsl_devices.go | 46 ++ pkg/nvidia-plugin/pkg/vgpu/pciutil.go | 204 +++++ pkg/nvidia-plugin/pkg/vgpu/pciutil_test.go | 42 + pkg/nvidia-plugin/pkg/vgpu/vgpu.go | 153 ++++ pkg/nvidia-plugin/pkg/vgpu/vgpu_test.go | 74 ++ pkg/nvidia-plugin/pkg/watch/watchers.go | 49 ++ version.mk | 32 +- 135 files changed, 12510 insertions(+), 3960 deletions(-) create mode 100644 cmd/device-plugin/nvidia/root.go create mode 100644 docker/Dockerfile.new delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/cdi/api.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/cdi/cdi.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/cdi/factory.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/cdi/null.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/cdi/options.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/info/version.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/api.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/factory.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/null.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/nvml.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/options.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/tegra.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/plugin/util_test.go delete mode 100644 
pkg/device-plugin/nvidiadevice/nvinternal/rm/allocate.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map_test.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/rm/health_test.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/rm/helper.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_devices_test.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/rm/rm.go delete mode 100644 pkg/device-plugin/nvidiadevice/nvinternal/rm/wsl_devices.go create mode 100644 pkg/nvidia-plugin/api/config/v1/config.go create mode 100644 pkg/nvidia-plugin/api/config/v1/consts.go create mode 100644 pkg/nvidia-plugin/api/config/v1/duration.go create mode 100644 pkg/nvidia-plugin/api/config/v1/flags.go create mode 100644 pkg/nvidia-plugin/api/config/v1/flags_test.go create mode 100644 pkg/nvidia-plugin/api/config/v1/imex.go create mode 100644 pkg/nvidia-plugin/api/config/v1/imex_test.go create mode 100644 pkg/nvidia-plugin/api/config/v1/replicas.go create mode 100644 pkg/nvidia-plugin/api/config/v1/replicas_test.go create mode 100644 pkg/nvidia-plugin/api/config/v1/resources.go create mode 100644 pkg/nvidia-plugin/api/config/v1/sharing.go create mode 100644 pkg/nvidia-plugin/api/config/v1/strategy.go create mode 100644 pkg/nvidia-plugin/mps-control-daemon/main.go create mode 100644 pkg/nvidia-plugin/mps-control-daemon/mount/mount-shm.go create mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/daemon.go create mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/device.go create mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/device_test.go create mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/log-tailer.go create mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/manager.go create mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/options.go create mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/root.go 
create mode 100644 pkg/nvidia-plugin/pkg/cdi/api.go rename pkg/{device-plugin/nvidiadevice/nvinternal => nvidia-plugin/pkg}/cdi/api_mock.go (100%) create mode 100644 pkg/nvidia-plugin/pkg/cdi/cdi.go create mode 100644 pkg/nvidia-plugin/pkg/cdi/imex.go create mode 100644 pkg/nvidia-plugin/pkg/cdi/null.go create mode 100644 pkg/nvidia-plugin/pkg/cdi/options.go create mode 100644 pkg/nvidia-plugin/pkg/cuda/api.go create mode 100644 pkg/nvidia-plugin/pkg/cuda/cgo_helpers.go create mode 100644 pkg/nvidia-plugin/pkg/cuda/consts.go create mode 100644 pkg/nvidia-plugin/pkg/cuda/cuda.go create mode 100644 pkg/nvidia-plugin/pkg/cuda/device.go create mode 100644 pkg/nvidia-plugin/pkg/cuda/result.go create mode 100644 pkg/nvidia-plugin/pkg/dependencies/dependencies.go create mode 100644 pkg/nvidia-plugin/pkg/flags/kubeclient.go create mode 100644 pkg/nvidia-plugin/pkg/flags/node.go create mode 100644 pkg/nvidia-plugin/pkg/imex/imex.go create mode 100644 pkg/nvidia-plugin/pkg/info/version.go create mode 100644 pkg/nvidia-plugin/pkg/lm/empty.go create mode 100644 pkg/nvidia-plugin/pkg/lm/imex.go create mode 100644 pkg/nvidia-plugin/pkg/lm/imex_test.go create mode 100644 pkg/nvidia-plugin/pkg/lm/labeler.go create mode 100644 pkg/nvidia-plugin/pkg/lm/labels.go create mode 100644 pkg/nvidia-plugin/pkg/lm/list.go create mode 100644 pkg/nvidia-plugin/pkg/lm/machine-type.go create mode 100644 pkg/nvidia-plugin/pkg/lm/mig-strategy.go create mode 100644 pkg/nvidia-plugin/pkg/lm/mig-strategy_test.go create mode 100644 pkg/nvidia-plugin/pkg/lm/nvml.go create mode 100644 pkg/nvidia-plugin/pkg/lm/nvml_test.go create mode 100644 pkg/nvidia-plugin/pkg/lm/output.go create mode 100644 pkg/nvidia-plugin/pkg/lm/resource.go create mode 100644 pkg/nvidia-plugin/pkg/lm/resource_test.go create mode 100644 pkg/nvidia-plugin/pkg/lm/strategy.go create mode 100644 pkg/nvidia-plugin/pkg/lm/timestamp.go create mode 100644 pkg/nvidia-plugin/pkg/lm/vgpu.go create mode 100644 
pkg/nvidia-plugin/pkg/logger/klog.go rename pkg/{device-plugin/nvidiadevice/nvinternal/mig/mig.go => nvidia-plugin/pkg/mig/mig-dp.go} (70%) create mode 100644 pkg/nvidia-plugin/pkg/mig/mig.go create mode 100644 pkg/nvidia-plugin/pkg/plugin/api.go create mode 100644 pkg/nvidia-plugin/pkg/plugin/factory.go create mode 100644 pkg/nvidia-plugin/pkg/plugin/mps.go create mode 100644 pkg/nvidia-plugin/pkg/plugin/options.go rename pkg/{device-plugin/nvidiadevice/nvinternal => nvidia-plugin/pkg}/plugin/register.go (98%) rename pkg/{device-plugin/nvidiadevice/nvinternal => nvidia-plugin/pkg}/plugin/register_test.go (91%) create mode 100644 pkg/nvidia-plugin/pkg/plugin/server.go rename pkg/{device-plugin/nvidiadevice/nvinternal => nvidia-plugin/pkg}/plugin/server_test.go (58%) rename pkg/{device-plugin/nvidiadevice/nvinternal => nvidia-plugin/pkg}/plugin/util.go (99%) create mode 100644 pkg/nvidia-plugin/pkg/resource/cuda-device.go create mode 100644 pkg/nvidia-plugin/pkg/resource/cuda-lib.go create mode 100644 pkg/nvidia-plugin/pkg/resource/device_mock.go create mode 100644 pkg/nvidia-plugin/pkg/resource/factory.go create mode 100644 pkg/nvidia-plugin/pkg/resource/fallback.go create mode 100644 pkg/nvidia-plugin/pkg/resource/fallback_test.go create mode 100644 pkg/nvidia-plugin/pkg/resource/manager_mock.go create mode 100644 pkg/nvidia-plugin/pkg/resource/null.go create mode 100644 pkg/nvidia-plugin/pkg/resource/nvml-device.go create mode 100644 pkg/nvidia-plugin/pkg/resource/nvml-lib.go create mode 100644 pkg/nvidia-plugin/pkg/resource/nvml-mig-device.go create mode 100644 pkg/nvidia-plugin/pkg/resource/sysfs-device.go create mode 100644 pkg/nvidia-plugin/pkg/resource/sysfs-lib.go create mode 100644 pkg/nvidia-plugin/pkg/resource/testing/resource-testing.go create mode 100644 pkg/nvidia-plugin/pkg/resource/types.go create mode 100644 pkg/nvidia-plugin/pkg/rm/allocate.go rename pkg/{device-plugin/nvidiadevice/nvinternal => nvidia-plugin/pkg}/rm/device_map.go (73%) create 
mode 100644 pkg/nvidia-plugin/pkg/rm/device_map_test.go rename pkg/{device-plugin/nvidiadevice/nvinternal => nvidia-plugin/pkg}/rm/devices.go (71%) rename pkg/{device-plugin/nvidiadevice/nvinternal => nvidia-plugin/pkg}/rm/health.go (81%) create mode 100644 pkg/nvidia-plugin/pkg/rm/health_test.go create mode 100644 pkg/nvidia-plugin/pkg/rm/helper.go rename pkg/{device-plugin/nvidiadevice/nvinternal => nvidia-plugin/pkg}/rm/nvml_devices.go (65%) create mode 100644 pkg/nvidia-plugin/pkg/rm/nvml_manager.go create mode 100644 pkg/nvidia-plugin/pkg/rm/rm.go create mode 100644 pkg/nvidia-plugin/pkg/rm/rm_test.go rename pkg/{device-plugin/nvidiadevice/nvinternal => nvidia-plugin/pkg}/rm/tegra_devices.go (53%) rename pkg/{device-plugin/nvidiadevice/nvinternal => nvidia-plugin/pkg}/rm/tegra_manager.go (55%) create mode 100644 pkg/nvidia-plugin/pkg/rm/wsl_devices.go create mode 100644 pkg/nvidia-plugin/pkg/vgpu/pciutil.go create mode 100644 pkg/nvidia-plugin/pkg/vgpu/pciutil_test.go create mode 100644 pkg/nvidia-plugin/pkg/vgpu/vgpu.go create mode 100644 pkg/nvidia-plugin/pkg/vgpu/vgpu_test.go create mode 100644 pkg/nvidia-plugin/pkg/watch/watchers.go diff --git a/Makefile b/Makefile index 3dd3709b2..cf697b9c7 100644 --- a/Makefile +++ b/Makefile @@ -7,10 +7,10 @@ docker: docker build \ --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} \ --build-arg TARGET_ARCH=${TARGET_ARCH} \ + --build-arg NVIDIA_DEVEL_IMAGE=${NVIDIA_DEVEL_IMAGE} \ --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} \ --build-arg DEST_DIR=${DEST_DIR} \ --build-arg VERSION=${VERSION} \ - --build-arg GOPROXY=https://goproxy.cn,direct \ . -f=docker/Dockerfile -t ${IMG_TAG} dockerwithlib: diff --git a/cmd/device-plugin/nvidia/main.go b/cmd/device-plugin/nvidia/main.go index c9d823b4c..cec7923ca 100644 --- a/cmd/device-plugin/nvidia/main.go +++ b/cmd/device-plugin/nvidia/main.go @@ -1,73 +1,61 @@ /* -Copyright 2024 The HAMi Authors. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package main import ( "encoding/json" - "flag" + "errors" "fmt" "os" + "path/filepath" "syscall" "time" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/info" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" - "github.com/Project-HAMi/HAMi/pkg/util" - flagutil "github.com/Project-HAMi/HAMi/pkg/util/flag" - - spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + nvinfo "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/fsnotify/fsnotify" - cli "github.com/urfave/cli/v2" - errorsutil "k8s.io/apimachinery/pkg/util/errors" + "github.com/urfave/cli/v2" "k8s.io/klog/v2" - kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/info" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/logger" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/plugin" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/watch" + "github.com/Project-HAMi/HAMi/pkg/util" ) -func main() { - var configFile string +type options struct { + flags []cli.Flag + configFile string + kubeletSocket string +} +func main() { c := cli.NewApp() + o := &options{} c.Name = "NVIDIA Device Plugin" c.Usage = "NVIDIA device plugin for Kubernetes" + c.Version = info.GetVersionString() c.Action = func(ctx *cli.Context) error { - flagutil.PrintCliFlags(ctx) - return start(ctx, c.Flags) - } - c.Commands = []*cli.Command{ - { - Name: "version", - Usage: "Show the version of NVIDIA Device Plugin", - Action: func(c *cli.Context) error { - fmt.Printf("%s version: %s\n", c.App.Name, info.GetVersionString()) - return nil - }, - 
}, - } - - flagset := flag.NewFlagSet("klog", flag.ExitOnError) - klog.InitFlags(flagset) - - c.Before = func(ctx *cli.Context) error { - logLevel := ctx.Int("v") - if err := flagset.Set("v", fmt.Sprintf("%d", logLevel)); err != nil { - return err - } - return nil + return start(ctx, o) } c.Flags = []cli.Flag{ @@ -84,11 +72,18 @@ func main() { EnvVars: []string{"FAIL_ON_INIT_ERROR"}, }, &cli.StringFlag{ - Name: "nvidia-driver-root", + Name: "driver-root", + Aliases: []string{"nvidia-driver-root"}, Value: "/", - Usage: "the root path for the NVIDIA driver installation (typical values are '/' or '/run/nvidia/driver')", + Usage: "the root path for the NVIDIA driver installation on the host (typical values are '/' or '/run/nvidia/driver')", EnvVars: []string{"NVIDIA_DRIVER_ROOT"}, }, + &cli.StringFlag{ + Name: "dev-root", + Aliases: []string{"nvidia-dev-root"}, + Usage: "the root path for the NVIDIA device nodes on the host (typical values are '/' or '/run/nvidia/driver')", + EnvVars: []string{"NVIDIA_DEV_ROOT"}, + }, &cli.BoolFlag{ Name: "pass-device-specs", Value: false, @@ -97,7 +92,7 @@ func main() { }, &cli.StringSliceFlag{ Name: "device-list-strategy", - Value: cli.NewStringSlice(string(spec.DeviceListStrategyEnvvar)), + Value: cli.NewStringSlice(string(spec.DeviceListStrategyEnvVar)), Usage: "the desired strategy for passing the device list to the underlying runtime:\n\t\t[envvar | volume-mounts | cdi-annotations]", EnvVars: []string{"DEVICE_LIST_STRATEGY"}, }, @@ -117,10 +112,17 @@ func main() { Usage: "ensure that containers are started with NVIDIA_MOFED=enabled", EnvVars: []string{"MOFED_ENABLED"}, }, + &cli.StringFlag{ + Name: "kubelet-socket", + Value: pluginapi.KubeletSocket, + Usage: "specify the socket for communicating with the kubelet; if this is empty, no connection with the kubelet is attempted", + Destination: &o.kubeletSocket, + EnvVars: []string{"KUBELET_SOCKET"}, + }, &cli.StringFlag{ Name: "config-file", Usage: "the path to a config file as an 
alternative to command line options or environment variables", - Destination: &configFile, + Destination: &o.configFile, EnvVars: []string{"CONFIG_FILE"}, }, &cli.StringFlag{ @@ -130,24 +132,45 @@ func main() { EnvVars: []string{"CDI_ANNOTATION_PREFIX"}, }, &cli.StringFlag{ - Name: "nvidia-ctk-path", + Name: "nvidia-cdi-hook-path", + Aliases: []string{"nvidia-ctk-path"}, Value: spec.DefaultNvidiaCTKPath, - Usage: "the path to use for the nvidia-ctk in the generated CDI specification", - EnvVars: []string{"NVIDIA_CTK_PATH"}, + Usage: "the path to use for NVIDIA CDI hooks in the generated CDI specification", + EnvVars: []string{"NVIDIA_CDI_HOOK_PATH", "NVIDIA_CTK_PATH"}, }, &cli.StringFlag{ - Name: "container-driver-root", + Name: "driver-root-ctr-path", + Aliases: []string{"container-driver-root"}, Value: spec.DefaultContainerDriverRoot, Usage: "the path where the NVIDIA driver root is mounted in the container; used for generating CDI specifications", - EnvVars: []string{"CONTAINER_DRIVER_ROOT"}, + EnvVars: []string{"DRIVER_ROOT_CTR_PATH", "CONTAINER_DRIVER_ROOT"}, + }, + &cli.StringFlag{ + Name: "mps-root", + Usage: "the path on the host where MPS-specific mounts and files are created by the MPS control daemon manager", + EnvVars: []string{"MPS_ROOT"}, + }, + &cli.StringFlag{ + Name: "device-discovery-strategy", + Value: "auto", + Usage: "the strategy to use to discover devices: 'auto', 'nvml', or 'tegra'", + EnvVars: []string{"DEVICE_DISCOVERY_STRATEGY"}, + }, + &cli.IntSliceFlag{ + Name: "imex-channel-ids", + Usage: "A list of IMEX channels to inject.", + EnvVars: []string{"IMEX_CHANNEL_IDS"}, }, - &cli.IntFlag{ - Name: "v", - Usage: "number for the log level verbosity", - Value: 0, + &cli.BoolFlag{ + Name: "imex-required", + Usage: "The specified IMEX channels are required", + EnvVars: []string{"IMEX_REQUIRED"}, }, } + // add extra flags for HAMi c.Flags = append(c.Flags, addFlags()...) 
+ o.flags = c.Flags + err := c.Run(os.Args) if err != nil { klog.Error(err) @@ -155,15 +178,50 @@ func main() { } } -func validateFlags(config *spec.Config) error { - _, err := spec.NewDeviceListStrategies(*config.Flags.Plugin.DeviceListStrategy) +func validateFlags(infolib nvinfo.Interface, config *spec.Config) error { + deviceListStrategies, err := spec.NewDeviceListStrategies(*config.Flags.Plugin.DeviceListStrategy) if err != nil { return fmt.Errorf("invalid --device-list-strategy option: %v", err) } + hasNvml, _ := infolib.HasNvml() + if deviceListStrategies.AnyCDIEnabled() && !hasNvml { + return fmt.Errorf("CDI --device-list-strategy options are only supported on NVML-based systems") + } + if *config.Flags.Plugin.DeviceIDStrategy != spec.DeviceIDStrategyUUID && *config.Flags.Plugin.DeviceIDStrategy != spec.DeviceIDStrategyIndex { return fmt.Errorf("invalid --device-id-strategy option: %v", *config.Flags.Plugin.DeviceIDStrategy) } + + if config.Sharing.SharingStrategy() == spec.SharingStrategyMPS { + if *config.Flags.MigStrategy == spec.MigStrategyMixed { + return fmt.Errorf("using --mig-strategy=mixed is not supported with MPS") + } + if config.Flags.MpsRoot == nil || *config.Flags.MpsRoot == "" { + return fmt.Errorf("using MPS requires --mps-root to be specified") + } + } + + switch *config.Flags.DeviceDiscoveryStrategy { + case "auto": + case "nvml": + case "tegra": + default: + return fmt.Errorf("invalid --device-discovery-strategy option %v", *config.Flags.DeviceDiscoveryStrategy) + } + + switch *config.Flags.MigStrategy { + case spec.MigStrategyNone: + case spec.MigStrategySingle: + case spec.MigStrategyMixed: + default: + return fmt.Errorf("unknown MIG strategy: %v", *config.Flags.MigStrategy) + } + + if err := spec.AssertChannelIDsValid(config.Imex.ChannelIDs); err != nil { + return fmt.Errorf("invalid IMEX channel IDs: %w", err) + } + return nil } @@ -172,35 +230,38 @@ func loadConfig(c *cli.Context, flags []cli.Flag) (*spec.Config, error) { if err != 
nil { return nil, fmt.Errorf("unable to finalize config: %v", err) } - err = validateFlags(config) - if err != nil { - return nil, fmt.Errorf("unable to validate flags: %v", err) - } config.Flags.GFD = nil return config, nil } -func start(c *cli.Context, flags []cli.Flag) error { - klog.Info("Starting FS watcher.") +func start(c *cli.Context, o *options) error { + klog.InfoS(fmt.Sprintf("Starting %s", c.App.Name), "version", c.App.Version) + util.NodeName = os.Getenv(util.NodeNameEnvName) - watcher, err := newFSWatcher(kubeletdevicepluginv1beta1.DevicePluginPath) + // watcher, err := newFSWatcher(kubeletdevicepluginv1beta1.DevicePluginPath) + // if err != nil { + // return fmt.Errorf("failed to create FS watcher: %v", err) + // } + // defer watcher.Close() + + kubeletSocketDir := filepath.Dir(o.kubeletSocket) + klog.Infof("Starting FS watcher for %v", kubeletSocketDir) + watcher, err := watch.Files(kubeletSocketDir) if err != nil { - return fmt.Errorf("failed to create FS watcher: %v", err) + return fmt.Errorf("failed to create FS watcher for %s: %v", pluginapi.DevicePluginPath, err) } defer watcher.Close() - //device.InitDevices() - /*Loading config files*/ klog.Infof("Start working on node %s", util.NodeName) klog.Info("Starting OS watcher.") - sigs := newOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) + sigs := watch.Signals(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) - var restarting bool + var started bool var restartTimeout <-chan time.Time var plugins []plugin.Interface restart: // If we are restarting, stop plugins from previous run. 
- if restarting { + if started { err := stopPlugins(plugins) if err != nil { return fmt.Errorf("error stopping plugins from previous run: %v", err) @@ -208,18 +269,17 @@ restart: } klog.Info("Starting Plugins.") - plugins, restartPlugins, err := startPlugins(c, flags, restarting) + plugins, restartPlugins, err := startPlugins(c, o) if err != nil { return fmt.Errorf("error starting plugins: %v", err) } + started = true if restartPlugins { - klog.Info("Failed to start one or more plugins. Retrying in 30s...") + klog.Infof("Failed to start one or more plugins. Retrying in 30s...") restartTimeout = time.After(30 * time.Second) } - restarting = true - // Start an infinite loop, waiting for several indicators to either log // some messages, trigger a restart of the plugins, or exit the program. for { @@ -229,17 +289,17 @@ restart: goto restart // Detect a kubelet restart by watching for a newly created - // 'kubeletdevicepluginv1beta1.KubeletSocket' file. When this occurs, restart this loop, + // 'pluginapi.KubeletSocket' file. When this occurs, restart this loop, // restarting all of the plugins in the process. case event := <-watcher.Events: - if event.Name == kubeletdevicepluginv1beta1.KubeletSocket && event.Op&fsnotify.Create == fsnotify.Create { - klog.Infof("inotify: %s created, restarting.", kubeletdevicepluginv1beta1.KubeletSocket) + if o.kubeletSocket != "" && event.Name == o.kubeletSocket && event.Op&fsnotify.Create == fsnotify.Create { + klog.Infof("inotify: %s created, restarting.", o.kubeletSocket) goto restart } // Watch for any other fs errors and log them. case err := <-watcher.Errors: - klog.Errorf("inotify: %s", err) + klog.Infof("inotify: %s", err) // Watch for any signals from the OS. On SIGHUP, restart this loop, // restarting all of the plugins in the process. 
On all other @@ -263,32 +323,47 @@ exit: return nil } -func startPlugins(c *cli.Context, flags []cli.Flag, restarting bool) ([]plugin.Interface, bool, error) { +func startPlugins(c *cli.Context, o *options) ([]plugin.Interface, bool, error) { // Load the configuration file klog.Info("Loading configuration.") - config, err := loadConfig(c, flags) + config, err := loadConfig(c, o.flags) if err != nil { return nil, false, fmt.Errorf("unable to load config: %v", err) } - disableResourceRenamingInConfig(config) + spec.DisableResourceNamingInConfig(logger.ToKlog, config) - /*Loading config files*/ - //fmt.Println("NodeName=", config.NodeName) - devConfig, err := generateDeviceConfigFromNvidia(config, c, flags) + devConfig, err := generateDeviceConfigFromNvidia(config, c, o.flags) if err != nil { klog.Errorf("failed to load config file %s", err.Error()) return nil, false, err } + driverRoot := root(*devConfig.Config.Flags.Plugin.ContainerDriverRoot) + // We construct an NVML library specifying the path to libnvidia-ml.so.1 + // explicitly so that we don't have to rely on the library path. + nvmllib := nvml.New( + nvml.WithLibraryPath(driverRoot.tryResolveLibrary("libnvidia-ml.so.1")), + ) + devicelib := device.New(nvmllib) + infolib := nvinfo.New( + nvinfo.WithNvmlLib(nvmllib), + nvinfo.WithDeviceLib(devicelib), + ) + + err = validateFlags(infolib, devConfig.Config) + if err != nil { + return nil, false, fmt.Errorf("unable to validate flags: %v", err) + } + // Update the configuration file with default resources. klog.Info("Updating config with default resource matching patterns.") - err = rm.AddDefaultResourcesToConfig(&devConfig) + err = rm.AddDefaultResourcesToConfig(infolib, nvmllib, devicelib, devConfig.Config) if err != nil { return nil, false, fmt.Errorf("unable to add default resources to config: %v", err) } // Print the config to the output. 
- configJSON, err := json.MarshalIndent(devConfig, "", " ") + configJSON, err := json.MarshalIndent(devConfig.Config, "", " ") if err != nil { return nil, false, fmt.Errorf("failed to marshal config to JSON: %v", err) } @@ -296,11 +371,7 @@ func startPlugins(c *cli.Context, flags []cli.Flag, restarting bool) ([]plugin.I // Get the set of plugins. klog.Info("Retrieving plugins.") - pluginManager, err := NewPluginManager(&devConfig) - if err != nil { - return nil, false, fmt.Errorf("error creating plugin manager: %v", err) - } - plugins, err := pluginManager.GetPlugins() + plugins, err := GetPlugins(infolib, nvmllib, devicelib, devConfig) if err != nil { return nil, false, fmt.Errorf("error getting plugins: %v", err) } @@ -316,10 +387,8 @@ func startPlugins(c *cli.Context, flags []cli.Flag, restarting bool) ([]plugin.I } // Start the gRPC server for plugin p and connect it with the kubelet. - if err := p.Start(); err != nil { - klog.Error("Could not contact Kubelet. Did you enable the device plugin feature gate?") - klog.Error("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") - klog.Error("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") + if err := p.Start(o.kubeletSocket); err != nil { + klog.Errorf("Failed to start plugin: %v", err) return plugins, true, nil } started++ @@ -334,48 +403,9 @@ func startPlugins(c *cli.Context, flags []cli.Flag, restarting bool) ([]plugin.I func stopPlugins(plugins []plugin.Interface) error { klog.Info("Stopping plugins.") - errs := []error{} + var errs error for _, p := range plugins { - err := p.Stop() - errs = append(errs, err) - } - return errorsutil.NewAggregate(errs) -} - -// disableResourceRenamingInConfig temporarily disable the resource renaming feature of the plugin. -// We plan to reeenable this feature in a future release. 
-func disableResourceRenamingInConfig(config *spec.Config) { - // Disable resource renaming through config.Resource - if len(config.Resources.GPUs) > 0 || len(config.Resources.MIGs) > 0 { - klog.Infof("Customizing the 'resources' field is not yet supported in the config. Ignoring...") - } - config.Resources.GPUs = nil - config.Resources.MIGs = nil - - // Disable renaming / device selection in Sharing.TimeSlicing.Resources - renameByDefault := config.Sharing.TimeSlicing.RenameByDefault - setsNonDefaultRename := false - setsDevices := false - for i, r := range config.Sharing.TimeSlicing.Resources { - if !renameByDefault && r.Rename != "" { - setsNonDefaultRename = true - config.Sharing.TimeSlicing.Resources[i].Rename = "" - } - if renameByDefault && r.Rename != r.Name.DefaultSharedRename() { - setsNonDefaultRename = true - config.Sharing.TimeSlicing.Resources[i].Rename = r.Name.DefaultSharedRename() - } - if !r.Devices.All { - setsDevices = true - config.Sharing.TimeSlicing.Resources[i].Devices.All = true - config.Sharing.TimeSlicing.Resources[i].Devices.Count = 0 - config.Sharing.TimeSlicing.Resources[i].Devices.List = nil - } - } - if setsNonDefaultRename { - klog.Warning("Setting the 'rename' field in sharing.timeSlicing.resources is not yet supported in the config. Ignoring...") - } - if setsDevices { - klog.Warning("Customizing the 'devices' field in sharing.timeSlicing.resources is not yet supported in the config. Ignoring...") + errs = errors.Join(errs, p.Stop()) } + return errs } diff --git a/cmd/device-plugin/nvidia/plugin-manager.go b/cmd/device-plugin/nvidia/plugin-manager.go index 237f6b759..02c435b17 100644 --- a/cmd/device-plugin/nvidia/plugin-manager.go +++ b/cmd/device-plugin/nvidia/plugin-manager.go @@ -1,82 +1,81 @@ /* -Copyright 2024 The HAMi Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package main import ( "fmt" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager" - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" - "github.com/NVIDIA/go-nvlib/pkg/nvml" - spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/cdi" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/imex" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/plugin" ) -// NewPluginManager creates an NVML-based plugin manager. 
-func NewPluginManager(config *nvidia.DeviceConfig) (manager.Interface, error) { - var err error - switch *config.Flags.MigStrategy { - case spec.MigStrategyNone: - case spec.MigStrategySingle: - case spec.MigStrategyMixed: - default: - return nil, fmt.Errorf("unknown strategy: %v", *config.Flags.MigStrategy) - } - - nvmllib := nvml.New() +// GetPlugins returns a set of plugins for the specified configuration. +func GetPlugins(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, config *nvidia.DeviceConfig) ([]plugin.Interface, error) { + // TODO: We could consider passing this as an argument since it should already be used to construct nvmllib. + driverRoot := root(*config.Flags.Plugin.ContainerDriverRoot) deviceListStrategies, err := spec.NewDeviceListStrategies(*config.Flags.Plugin.DeviceListStrategy) if err != nil { return nil, fmt.Errorf("invalid device list strategy: %v", err) } - cdiEnabled := deviceListStrategies.IsCDIEnabled() + imexChannels, err := imex.GetChannels(config.Config, driverRoot.getDevRoot()) + if err != nil { + return nil, fmt.Errorf("error querying IMEX channels: %w", err) + } - cdiHandler, err := cdi.New( - cdi.WithEnabled(cdiEnabled), - cdi.WithDriverRoot(*config.Flags.Plugin.ContainerDriverRoot), + cdiHandler, err := cdi.New(infolib, nvmllib, devicelib, + cdi.WithDeviceListStrategies(deviceListStrategies), + cdi.WithDriverRoot(string(driverRoot)), + cdi.WithDevRoot(driverRoot.getDevRoot()), cdi.WithTargetDriverRoot(*config.Flags.NvidiaDriverRoot), + cdi.WithTargetDevRoot(*config.Flags.NvidiaDevRoot), cdi.WithNvidiaCTKPath(*config.Flags.Plugin.NvidiaCTKPath), - cdi.WithNvml(nvmllib), cdi.WithDeviceIDStrategy(*config.Flags.Plugin.DeviceIDStrategy), cdi.WithVendor("k8s.device-plugin.nvidia.com"), cdi.WithGdsEnabled(*config.Flags.GDSEnabled), cdi.WithMofedEnabled(*config.Flags.MOFEDEnabled), + cdi.WithImexChannels(imexChannels), ) if err != nil { return nil, fmt.Errorf("unable to create cdi handler: %v", err) } - m, err 
:= manager.New( - manager.WithNVML(nvmllib), - manager.WithCDIEnabled(cdiEnabled), - manager.WithCDIHandler(cdiHandler), - manager.WithConfig(config), - manager.WithFailOnInitError(*config.Flags.FailOnInitError), - manager.WithMigStrategy(*config.Flags.MigStrategy), + plugins, err := plugin.New(infolib, nvmllib, devicelib, + plugin.WithCDIHandler(cdiHandler), + plugin.WithConfig(config), + plugin.WithDeviceListStrategies(deviceListStrategies), + plugin.WithFailOnInitError(*config.Flags.FailOnInitError), + plugin.WithImexChannels(imexChannels), ) if err != nil { - return nil, fmt.Errorf("unable to create plugin manager: %v", err) + return nil, fmt.Errorf("unable to create plugins: %w", err) } - if err := m.CreateCDISpecFile(); err != nil { + if err := cdiHandler.CreateSpecFile(); err != nil { return nil, fmt.Errorf("unable to create cdi spec file: %v", err) } - return m, nil + return plugins, nil } diff --git a/cmd/device-plugin/nvidia/root.go b/cmd/device-plugin/nvidia/root.go new file mode 100644 index 000000000..db9cec76e --- /dev/null +++ b/cmd/device-plugin/nvidia/root.go @@ -0,0 +1,85 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package main + +import ( + "fmt" + "os" + "path/filepath" +) + +type root string + +func (r root) join(parts ...string) string { + return filepath.Join(append([]string{string(r)}, parts...)...) +} + +// getDevRoot returns the dev root associated with the root. 
+// If the root is not a dev root, this defaults to "/". +func (r root) getDevRoot() string { + if r.isDevRoot() { + return string(r) + } + return "/" +} + +// isDevRoot checks whether the specified root is a dev root. +// A dev root is defined as a root containing a /dev folder. +func (r root) isDevRoot() bool { + stat, err := os.Stat(filepath.Join(string(r), "dev")) + if err != nil { + return false + } + return stat.IsDir() +} + +func (r root) tryResolveLibrary(libraryName string) string { + if r == "" || r == "/" { + return libraryName + } + + librarySearchPaths := []string{ + "/usr/lib64", + "/usr/lib/x86_64-linux-gnu", + "/usr/lib/aarch64-linux-gnu", + "/lib64", + "/lib/x86_64-linux-gnu", + "/lib/aarch64-linux-gnu", + } + + for _, d := range librarySearchPaths { + l := r.join(d, libraryName) + resolved, err := resolveLink(l) + if err != nil { + continue + } + return resolved + } + + return libraryName +} + +// resolveLink finds the target of a symlink or the file itself in the +// case of a regular file. +// This is equivalent to running `readlink -f ${l}`. 
+func resolveLink(l string) (string, error) { + resolved, err := filepath.EvalSymlinks(l) + if err != nil { + return "", fmt.Errorf("error resolving link '%v': %w", l, err) + } + return resolved, nil +} diff --git a/cmd/device-plugin/nvidia/vgpucfg.go b/cmd/device-plugin/nvidia/vgpucfg.go index a6d2a8307..15c523c99 100644 --- a/cmd/device-plugin/nvidia/vgpucfg.go +++ b/cmd/device-plugin/nvidia/vgpucfg.go @@ -21,14 +21,14 @@ import ( "os" "strings" + cli "github.com/urfave/cli/v2" + "k8s.io/klog/v2" + "github.com/Project-HAMi/HAMi/pkg/device" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/plugin" "github.com/Project-HAMi/HAMi/pkg/util" - - spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" - cli "github.com/urfave/cli/v2" - "k8s.io/klog/v2" ) func addFlags() []cli.Flag { @@ -97,8 +97,8 @@ func updateFromCLIFlag[T any](pflag **T, c *cli.Context, flagName string) { } } -func generateDeviceConfigFromNvidia(cfg *spec.Config, c *cli.Context, flags []cli.Flag) (nvidia.DeviceConfig, error) { - devcfg := nvidia.DeviceConfig{} +func generateDeviceConfigFromNvidia(cfg *spec.Config, c *cli.Context, flags []cli.Flag) (*nvidia.DeviceConfig, error) { + devcfg := &nvidia.DeviceConfig{} devcfg.Config = cfg klog.Infoln("flags=", flags) diff --git a/docker/Dockerfile b/docker/Dockerfile index addf74a25..e513b9fe2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,5 +1,6 @@ -ARG GOLANG_IMAGE=golang:1.22.5-bullseye -ARG NVIDIA_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 +ARG GOLANG_IMAGE +ARG NVIDIA_IMAGE +ARG NVIDIA_DEVEL_IMAGE FROM $GOLANG_IMAGE AS build FROM $GOLANG_IMAGE AS gobuild @@ -11,14 +12,14 @@ ADD . 
/k8s-vgpu RUN cd /k8s-vgpu && make all VERSION=$VERSION RUN go install github.com/NVIDIA/mig-parted/cmd/nvidia-mig-parted@v0.10.0 -FROM $NVIDIA_IMAGE AS nvbuild +FROM $NVIDIA_DEVEL_IMAGE AS nvbuild COPY ./libvgpu /libvgpu WORKDIR /libvgpu ENV DEBIAN_FRONTEND=noninteractive RUN apt-get -y update; apt-get -y install cmake RUN bash ./build.sh -FROM nvidia/cuda:12.6.3-base-ubuntu22.04 +FROM $NVIDIA_IMAGE ENV NVIDIA_DISABLE_REQUIRE="true" ENV NVIDIA_VISIBLE_DEVICES=all ENV NVIDIA_DRIVER_CAPABILITIES=utility diff --git a/docker/Dockerfile.new b/docker/Dockerfile.new new file mode 100644 index 000000000..33e8bcfb3 --- /dev/null +++ b/docker/Dockerfile.new @@ -0,0 +1,79 @@ +# Base image upgrade to UBI 9 with CUDA 12.6.3 +ARG GOLANG_VERSION=1.22.6 +FROM nvcr.io/nvidia/cuda:12.6.3-base-ubi9 AS build + +# Install essential build tools +RUN yum install -y \ + wget make git gcc \ + && \ + rm -rf /var/cache/yum/* + +# Install Go manually for better version control +RUN set -eux; \ + arch="$(uname -m)"; \ + case "${arch##*-}" in \ + x86_64 | amd64) ARCH='amd64' ;; \ + aarch64) ARCH='arm64' ;; \ + *) echo "unsupported architecture" ; exit 1 ;; \ + esac; \ + wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${ARCH}.tar.gz \ + | tar -C /usr/local -xz + +ENV GOPATH /go +ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH + +# Main build process +WORKDIR /build +COPY . . 
+RUN mkdir /artifacts +ARG VERSION="N/A" +ARG GIT_COMMIT="unknown" + +# Core components build +RUN cd /k8s-vgpu && make all VERSION=$VERSION PREFIX=/artifacts + +# Install NVIDIA MIG tool +RUN go install github.com/NVIDIA/mig-parted/cmd/nvidia-mig-parted@v0.10.0 + +# GPU library build stage +FROM nvidia/cuda:12.2.0-devel-ubuntu20.04 AS nvbuild +COPY ./libvgpu /libvgpu +WORKDIR /libvgpu +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get -y update && apt-get -y install cmake +RUN bash ./build.sh + +# Image cleanup stage +FROM redhat/ubi9-minimal:latest AS minimal +RUN rpm -qa --queryformat='^%{NAME}-\[0-9\].*\.%{ARCH}$\n' | sort -u > /tmp/package-names.minimal + +FROM nvcr.io/nvidia/cuda:12.6.3-base-ubi9 AS base +WORKDIR /cleanup +COPY --from=minimal /tmp/package-names.minimal . +COPY deployments/container/cleanup/* . +RUN ./cleanup.sh + +# Final image composition +FROM base +ENV NVIDIA_DISABLE_REQUIRE="true" \ + NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,utility + +# Metadata labels +LABEL version="$VERSION" \ + maintainer="opensource@4paradigm.com" \ + io.k8s.display-name="HAMi vGPU Plugin" \ + vendor="4paradigm" \ + com.nvidia.git-commit=${GIT_COMMIT} + +# File system organization +COPY ./LICENSE /k8s-vgpu/LICENSE +COPY --from=build /artifacts/ /k8s-vgpu/bin/ +COPY --from=build /go/bin/nvidia-mig-parted /k8s-vgpu/bin/ +COPY --from=nvbuild /libvgpu/build/libvgpu.so /k8s-vgpu/lib/nvidia/libvgpu.so."$VERSION" +COPY ./docker/*.sh /k8s-vgpu/bin/ +COPY ./lib /k8s-vgpu/lib + +# Runtime configuration +ENV PATH="/k8s-vgpu/bin:${PATH}" +ENTRYPOINT ["entrypoint.sh"] \ No newline at end of file diff --git a/go.mod b/go.mod index dacc9cb0c..a0edd200e 100644 --- a/go.mod +++ b/go.mod @@ -3,59 +3,68 @@ module github.com/Project-HAMi/HAMi go 1.22.2 require ( - github.com/NVIDIA/go-gpuallocator v0.3.2 - github.com/NVIDIA/go-nvlib v0.2.0 - github.com/NVIDIA/go-nvml v0.12.0-3 - github.com/NVIDIA/k8s-device-plugin v0.15.0 - 
github.com/NVIDIA/nvidia-container-toolkit v1.15.0 - github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a + github.com/NVIDIA/go-gpuallocator v0.5.0 + github.com/NVIDIA/go-nvlib v0.7.1 + github.com/NVIDIA/go-nvml v0.12.4-1 + github.com/NVIDIA/nvidia-container-toolkit v1.17.2 github.com/fsnotify/fsnotify v1.7.0 github.com/google/uuid v1.6.0 github.com/julienschmidt/httprouter v1.3.0 - github.com/onsi/ginkgo/v2 v2.17.1 - github.com/onsi/gomega v1.32.0 + github.com/onsi/ginkgo/v2 v2.17.2 + github.com/onsi/gomega v1.33.1 github.com/opencontainers/runtime-spec v1.2.0 + github.com/opencontainers/selinux v1.11.0 github.com/prometheus/client_golang v1.18.0 github.com/sirupsen/logrus v1.9.3 github.com/spf13/cobra v1.8.1 - github.com/stretchr/testify v1.9.0 - github.com/urfave/cli/v2 v2.27.1 - golang.org/x/net v0.35.0 + github.com/stretchr/testify v1.10.0 + github.com/urfave/cli/v2 v2.27.5 golang.org/x/term v0.29.0 golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d - google.golang.org/grpc v1.63.2 - google.golang.org/protobuf v1.33.0 + google.golang.org/grpc v1.65.0 + google.golang.org/protobuf v1.34.2 gopkg.in/yaml.v2 v2.4.0 gotest.tools/v3 v3.5.1 - k8s.io/api v0.29.3 - k8s.io/apimachinery v0.29.3 - k8s.io/client-go v0.29.3 - k8s.io/klog/v2 v2.120.1 + k8s.io/api v0.31.1 + k8s.io/apimachinery v0.31.1 + k8s.io/client-go v0.31.1 + k8s.io/klog/v2 v2.130.1 k8s.io/kube-scheduler v0.28.3 - k8s.io/kubelet v0.29.3 + k8s.io/kubelet v0.31.1 + k8s.io/mount-utils v0.31.1 sigs.k8s.io/controller-runtime v0.16.3 - tags.cncf.io/container-device-interface v0.7.1 + sigs.k8s.io/node-feature-discovery v0.15.4 + tags.cncf.io/container-device-interface v0.8.0 + tags.cncf.io/container-device-interface/specs-go v0.8.0 +) + +require ( + github.com/coreos/go-systemd/v22 v22.5.0 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect + github.com/godbus/dbus/v5 v5.1.0 // indirect + github.com/moby/sys/mountinfo v0.7.1 // indirect + 
golang.org/x/net v0.35.0 // indirect ) require ( github.com/beorn7/perks v1.0.1 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect - github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.5 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.11.3 // indirect github.com/evanphx/json-patch v5.9.0+incompatible // indirect - github.com/go-logr/logr v1.4.1 // indirect + github.com/go-logr/logr v1.4.2 // indirect github.com/go-openapi/jsonpointer v0.20.2 // indirect github.com/go-openapi/jsonreference v0.20.4 // indirect github.com/go-openapi/swag v0.22.9 // indirect - github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/google/gnostic-models v0.6.8 // indirect github.com/google/go-cmp v0.6.0 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect + github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6 // indirect + github.com/google/renameio v1.0.1 github.com/imdario/mergo v0.3.16 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect @@ -67,35 +76,33 @@ require ( github.com/opencontainers/runc v1.1.14 // indirect github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.0 // indirect github.com/prometheus/common v0.48.0 // indirect - github.com/prometheus/procfs v0.13.0 // 
indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect - github.com/spf13/pflag v1.0.5 // indirect + github.com/spf13/pflag v1.0.5 github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect - github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect - golang.org/x/mod v0.17.0 // indirect - golang.org/x/oauth2 v0.17.0 // indirect + github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect + golang.org/x/mod v0.20.0 + golang.org/x/oauth2 v0.21.0 // indirect golang.org/x/sys v0.30.0 // indirect golang.org/x/text v0.22.0 // indirect golang.org/x/time v0.5.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/appengine v1.6.8 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2 // indirect - k8s.io/utils v0.0.0-20240102154912-e7106e64919e // indirect + k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect + k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect - sigs.k8s.io/yaml v1.4.0 // indirect - tags.cncf.io/container-device-interface/specs-go v0.7.0 // indirect + sigs.k8s.io/yaml v1.4.0 ) replace ( github.com/Project-HAMi/HAMi/pkg/api => ./pkg/api - github.com/Project-HAMi/HAMi/pkg/device-plugin => ./pkg/device-plugin + // github.com/Project-HAMi/HAMi/pkg/device-plugin => ./pkg/device-plugin github.com/Project-HAMi/HAMi/test/utils => ./test/utils k8s.io/api => k8s.io/api v0.28.3 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.28.3 @@ -119,4 +126,5 @@ replace ( k8s.io/legacy-cloud-providers => 
k8s.io/legacy-cloud-providers v0.28.3 k8s.io/metrics => k8s.io/metrics v0.28.3 k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.28.3 +// github.com/Project-HAMi/HAMi/pkg/device-plugin => ./pkg/nvidia-plugin ) diff --git a/go.sum b/go.sum index de9e20bbc..f9092b7c3 100644 --- a/go.sum +++ b/go.sum @@ -1,37 +1,34 @@ -github.com/NVIDIA/go-gpuallocator v0.3.2 h1:gXaGgFKrtsBOvbZTZIWQ81yr7voHm5keRCXb3VNjMMU= -github.com/NVIDIA/go-gpuallocator v0.3.2/go.mod h1:OuqBvWRrs9+A783a753fK9YYP8P1BTf+T4Map+XfTUs= -github.com/NVIDIA/go-nvlib v0.2.0 h1:roq+SDstbP1fcy2XVH7wB2Gz2/Ud7Q+NGQYOcVITVrA= -github.com/NVIDIA/go-nvlib v0.2.0/go.mod h1:kFuLNTyD1tF6FbRFlk+/EdUW5BrkE+v1Y3A3/9zKSjA= -github.com/NVIDIA/go-nvml v0.12.0-3 h1:QwfjYxEqIQVRhl8327g2Y3ZvKResPydpGSKtCIIK9jE= -github.com/NVIDIA/go-nvml v0.12.0-3/go.mod h1:SOufGc5Wql+cxrIZ8RyJwVKDYxfbs4WPkHXqadcbfvA= -github.com/NVIDIA/k8s-device-plugin v0.15.0 h1:QKfAo6Xpl5M4Y9hltlYrzHjwGR+vfeAuiiNNyFN4DoE= -github.com/NVIDIA/k8s-device-plugin v0.15.0/go.mod h1:s6DHR9QG5+xAbWG7NniWTnrZI7wUojl1/hxeZClXm/U= -github.com/NVIDIA/nvidia-container-toolkit v1.15.0 h1:YmYZUKJzhz/lJSVH6k1mk5IUCHpt8HwRtwMrtBoCzhQ= -github.com/NVIDIA/nvidia-container-toolkit v1.15.0/go.mod h1:SUwxfwi+dl1LtVlpAnJEolxuZfCtAVmOKRGWhJYsiJI= +github.com/NVIDIA/go-gpuallocator v0.5.0 h1:166ICvPv2dU9oZ2J3kJ4y3XdbGCi6LhXgFZJtrqeu3A= +github.com/NVIDIA/go-gpuallocator v0.5.0/go.mod h1:zos5bTIN01hpQioOyu9oRKglrznImMQvm0bZllMmckw= +github.com/NVIDIA/go-nvlib v0.7.1 h1:7HHPZxoCjSLm1NgaRRjuhI8ffMCpc5Vgpg5yxQYUff8= +github.com/NVIDIA/go-nvlib v0.7.1/go.mod h1:2Kh2kYSP5IJ8EKf0/SYDzHiQKb9EJkwOf2LQzu6pXzY= +github.com/NVIDIA/go-nvml v0.12.4-1 h1:WKUvqshhWSNTfm47ETRhv0A0zJyr1ncCuHiXwoTrBEc= +github.com/NVIDIA/go-nvml v0.12.4-1/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= +github.com/NVIDIA/nvidia-container-toolkit v1.17.2 h1:iE6PK9SQH3HyDrOolu27xn3CJgURR3bDtnbfFrxdML8= +github.com/NVIDIA/nvidia-container-toolkit v1.17.2/go.mod 
h1:R6bNf6ca0IjjACa0ncKGvsrx6zSjsgz8QkFyBDk5szU= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= -github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= -github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= -github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= -github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a h1:sP3PcgyIkRlHqfF3Jfpe/7G8kf/qpzG4C8r94y9hLbE= -github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a/go.mod h1:xMRa4fJgXzSDFUCURSimOUgoSc+odohvO3uXT9xjqH0= -github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc= +github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.11.3 h1:yagOQz/38xJmcNeZJtrUcKjkHRltIaIFXKWeG1SkWGE= github.com/emicklei/go-restful/v3 v3.11.3/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= -github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= -github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/zapr v1.2.4 h1:QHVo+6stLbfJmYGkQ7uGHUCu5hnAFAj6mDe6Ea0SeOo= github.com/go-logr/zapr v1.2.4/go.mod h1:FyHWQIzQORZ0QVE1BtVHv3cKtNLuXsbNLtpuhNapBOA= github.com/go-openapi/jsonpointer v0.20.2 h1:mQc3nmndL8ZBzStEo3JYF8wzmeWffDH4VbXz58sAx6Q= @@ -40,27 +37,29 @@ github.com/go-openapi/jsonreference v0.20.4 h1:bKlDxQxQJgwpUSgOENiMPzCTBVuc7vTdX github.com/go-openapi/jsonreference v0.20.4/go.mod h1:5pZJyJP2MnYCpoeoMAql78cCHauHj0V9Lhc506VOpw4= github.com/go-openapi/swag v0.22.9 h1:XX2DssF+mQKM2DHsbgZK74y/zj4mo9I99+89xUmuZCE= github.com/go-openapi/swag v0.22.9/go.mod h1:3/OXnFfnMAwBD099SwYRk7GD3xOrr1iL7d/XNLXVVwE= 
-github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= -github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= +github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod 
h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6 h1:k7nVchz72niMH6YLQNvHSdIE7iqsQxK1P41mySCvssg= +github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw= +github.com/google/renameio v1.0.1 h1:Lh/jXZmvZxb0BBeSY5VKEfidcbcbenKjZFzM/q0fSeU= +github.com/google/renameio v1.0.1/go.mod h1:t/HQoYBZSsWSNK35C6CO/TpPLDVWvxOHboWUAweKUpk= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -69,7 +68,6 @@ github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= -github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -89,6 +87,8 @@ github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailru/easyjson v0.7.7 
h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mndrix/tap-go v0.0.0-20171203230836-629fa407e90b/go.mod h1:pzzDgJWZ34fGzaAZGFW22KVZDfyrYW+QABMrWnJBnSs= +github.com/moby/sys/mountinfo v0.7.1 h1:/tTvQaSJRr2FshkhXiIpux6fQ2Zvc4j7tAhMTStAG2g= +github.com/moby/sys/mountinfo v0.7.1/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -97,10 +97,10 @@ github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjY github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.17.1 h1:V++EzdbhI4ZV4ev0UTIj0PzhzOcReJFyJaLjtSF55M8= -github.com/onsi/ginkgo/v2 v2.17.1/go.mod h1:llBI3WDLL9Z6taip6f33H76YcWtJv+7R3HigUjbIBOs= -github.com/onsi/gomega v1.32.0 h1:JRYU78fJ1LPxlckP6Txi/EYqJvjtMrDC04/MM5XRHPk= -github.com/onsi/gomega v1.32.0/go.mod h1:a4x4gW6Pz2yK1MAmvluYme5lvYTn61afQ2ETw/8n4Lg= +github.com/onsi/ginkgo/v2 v2.17.2 h1:7eMhcy3GimbsA3hEnVKdw/PQM9XN9krpKVXsZdph0/g= +github.com/onsi/ginkgo/v2 v2.17.2/go.mod h1:nP2DPOQoNsQmsVyv5rDA8JkXQoCs6goXIvr/PRJ1eCc= +github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= +github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= github.com/opencontainers/runc v1.1.14 h1:rgSuzbmgz5DUJjeSnw337TxDbRuqjs6iqQck/2weR6w= 
github.com/opencontainers/runc v1.1.14/go.mod h1:E4C2z+7BxR7GHXp0hAY53mek+x49X1LjPNeMTfRGvOA= github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= @@ -113,18 +113,19 @@ github.com/opencontainers/selinux v1.11.0 h1:+5Zbo97w3Lbmb3PeqQtpmTkMwsW5nRI3YaL github.com/opencontainers/selinux v1.11.0/go.mod h1:E5dMC3VPuVvVHDYmi78qvhJp8+M586T4DlDRYpFkyec= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.18.0 h1:HzFfmkOzH5Q8L8G+kSJKUx5dtG87sewO+FoDDqP5Tbk= github.com/prometheus/client_golang v1.18.0/go.mod h1:T+GXkCk5wSJyOqMIzVgvvjFDlkOQntgjkJWKrN5txjA= github.com/prometheus/client_model v0.6.0 h1:k1v3CzpSRUTrKMppY35TLwPvxHqBu0bYgxZzqGIgaos= github.com/prometheus/client_model v0.6.0/go.mod h1:NTQHnmxFpouOD0DpvP4XujX3CdOAGQPoaGhyTchlyt8= github.com/prometheus/common v0.48.0 h1:QO8U2CdOzSn1BBsmXJXduaaW+dY/5QLjfB8svtSzKKE= github.com/prometheus/common v0.48.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc= -github.com/prometheus/procfs v0.13.0 h1:GqzLlQyfsPbaEHaQkO7tbDlriv/4o5Hudv6OXHGKX7o= -github.com/prometheus/procfs v0.13.0/go.mod h1:cd4PFCR54QLnGKPaKGA6l+cfuNXtht43ZKY6tow0Y1g= -github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= -github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/prometheus/procfs v0.15.1 
h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= @@ -137,15 +138,14 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= github.com/urfave/cli v1.19.1/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= -github.com/urfave/cli/v2 v2.27.1 h1:8xSQ6szndafKVRmfyeUMxkNUJQMjL1F2zmsZ+qHpfho= -github.com/urfave/cli/v2 v2.27.1/go.mod h1:8qnjx1vcq5s2/wpsqoZFndg2CE5tNFyrTvS6SinrnYQ= 
+github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w= +github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ= github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb h1:zGWFAtiMcyryUHoUjUJX0/lt1H2+i2Ka2n+D3DImSNo= github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= @@ -153,11 +153,10 @@ github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHo github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74= github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= -github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU= -github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8= +github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= +github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.25.0 h1:4Hvk6GtkucQ790dqmj7l1eEnRdKm3k3ZUrUMS2d5+5c= @@ -165,67 +164,38 @@ go.uber.org/zap v1.25.0/go.mod h1:JIAUzQIH94IC4fOJQm7gMmBJP5k7wQfdcnYdPoEXJYk= 
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/exp v0.0.0-20231206192017-f3f8817b8deb h1:c0vyKkb6yr3KR7jEfJaOSv4lG7xPkbN6r52aJz1d8a8= +golang.org/x/exp v0.0.0-20231206192017-f3f8817b8deb/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= -golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod 
h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= -golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= -golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= -golang.org/x/oauth2 v0.17.0 h1:6m3ZPmLEFdVxKKWnKq4VqZ60gutO35zm+zrAHVmHyDQ= -golang.org/x/oauth2 v0.17.0/go.mod h1:OzPDGQiuQMguemayvdylqddI7qcD9lnSDb+1FiwQ5HA= +golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= +golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= +golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= -golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= -golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= -golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod 
h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= -golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= -golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= @@ -234,7 +204,6 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -243,16 +212,12 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= 
-google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= -google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de h1:cZGRis4/ot9uVm639a+rHCUaG0JJHEsdyzSQTMX+suY= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:H4O17MA/PE9BsGx3w+a+W2VOLLD1Qf7oJneAoU6WktY= -google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= -google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= -google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 h1:BwIjyKYGsK9dMCBOorzRri8MQwmi7mT9rGHsCEinZkA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/grpc v1.65.0 h1:bs/cUb4lp1G5iImFFd3u5ixQzweKizoZJAwBNLR42lc= +google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= @@ -272,25 +237,29 @@ k8s.io/apimachinery v0.28.3 h1:B1wYx8txOaCQG0HmYF6nbpU8dg6HvA06x5tEffvOe7A= k8s.io/apimachinery 
v0.28.3/go.mod h1:uQTKmIqs+rAYaq+DFaoD2X7pcjLOqbQX2AOiO0nIpb8= k8s.io/client-go v0.28.3 h1:2OqNb72ZuTZPKCl+4gTKvqao0AMOl9f3o2ijbAj3LI4= k8s.io/client-go v0.28.3/go.mod h1:LTykbBp9gsA7SwqirlCXBWtK0guzfhpoW4qSm7i9dxo= -k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= -k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2 h1:02WBxjyRwX4rJdl3XlWVjFbXT/kAKCsipoM8hQY3Dwo= -k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2/go.mod h1:B7Huvd1LKZtTYmY+nC6rnmN8lyGYT9lifBcPD5epL6k= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= k8s.io/kube-scheduler v0.28.3 h1:sCvDOzRSDGCZ4whVykNoh/HbAZbwBMhbJ9xFab4QUCI= k8s.io/kube-scheduler v0.28.3/go.mod h1:bZ0V8rlDE2eoLl2At4mSdGBKe9k6cA9P0+AuJ6aG+Os= k8s.io/kubelet v0.28.3 h1:bp/uIf1R5F61BlFvFtzc4PDEiK7TtFcw3wFJlc0V0LM= k8s.io/kubelet v0.28.3/go.mod h1:E3NHYbp/v45Ao6AD0EOZnqO3L0R6Haks6Nm0+bnFwtU= -k8s.io/utils v0.0.0-20240102154912-e7106e64919e h1:eQ/4ljkx21sObifjzXwlPKpdGLrCfRziVtos3ofG/sQ= -k8s.io/utils v0.0.0-20240102154912-e7106e64919e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/mount-utils v0.31.1 h1:f8UrH9kRynljmdNGM6BaCvFUON5ZPKDgE+ltmYqI4wA= +k8s.io/mount-utils v0.31.1/go.mod h1:HV/VYBUGqYUj4vt82YltzpWvgv8FPg0G9ItyInT3NPU= +k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= +k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/controller-runtime v0.16.3 h1:2TuvuokmfXvDUamSx1SuAOO3eTyye+47mJCigwG62c4= sigs.k8s.io/controller-runtime v0.16.3/go.mod 
h1:j7bialYoSn142nv9sCOJmQgDXQXxnroFU4VnX/brVJ0= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/node-feature-discovery v0.15.4 h1:IoSN/G+Bl94Liu+b862a3gx/rqCKdeUtcPxbL4VnOYg= +sigs.k8s.io/node-feature-discovery v0.15.4/go.mod h1:vp165AxVdzCWYIKuaLkckGo53/D5OR+WSyePSUEIYQw= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= -tags.cncf.io/container-device-interface v0.7.1 h1:MATNCbAD1su9U6zwQe5BrQ2vGGp1GBayD70bYaxYCNE= -tags.cncf.io/container-device-interface v0.7.1/go.mod h1:h1JVuOqTQVORp8DziaWKUCDNzAmN+zeCbqbqD30D0ZQ= -tags.cncf.io/container-device-interface/specs-go v0.7.0 h1:w/maMGVeLP6TIQJVYT5pbqTi8SCw/iHZ+n4ignuGHqg= -tags.cncf.io/container-device-interface/specs-go v0.7.0/go.mod h1:hMAwAbMZyBLdmYqWgYcKH0F/yctNpV3P35f+/088A80= +tags.cncf.io/container-device-interface v0.8.0 h1:8bCFo/g9WODjWx3m6EYl3GfUG31eKJbaggyBDxEldRc= +tags.cncf.io/container-device-interface v0.8.0/go.mod h1:Apb7N4VdILW0EVdEMRYXIDVRZfNJZ+kmEUss2kRRQ6Y= +tags.cncf.io/container-device-interface/specs-go v0.8.0 h1:QYGFzGxvYK/ZLMrjhvY0RjpUavIn4KcmRmVP/JjdBTA= +tags.cncf.io/container-device-interface/specs-go v0.8.0/go.mod h1:BhJIkjjPh4qpys+qm4DAYtUyryaTDg9zris+AczXyws= diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/api.go b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/api.go deleted file mode 100644 index 93c818b0e..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/api.go +++ /dev/null @@ -1,41 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this 
file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package cdi - -// Interface provides the API to the 'cdi' package -// -//go:generate moq -stub -out api_mock.go . Interface -type Interface interface { - CreateSpecFile() error - QualifiedName(string, string) string -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/cdi.go b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/cdi.go deleted file mode 100644 index 45103b4a6..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/cdi.go +++ /dev/null @@ -1,193 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. 
NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package cdi - -import ( - "fmt" - "path/filepath" - - nvdevice "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - "github.com/NVIDIA/go-nvlib/pkg/nvml" - "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" - roottransform "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform/root" - "github.com/sirupsen/logrus" - cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" -) - -const ( - cdiRoot = "/var/run/cdi" -) - -// cdiHandler creates CDI specs for devices assocatied with the device plugin. -type cdiHandler struct { - logger *logrus.Logger - nvml nvml.Interface - nvdevice nvdevice.Interface - driverRoot string - targetDriverRoot string - nvidiaCTKPath string - cdiRoot string - vendor string - deviceIDStrategy string - - enabled bool - gdsEnabled bool - mofedEnabled bool - - cdilibs map[string]nvcdi.Interface -} - -var _ Interface = &cdiHandler{} - -// newHandler constructs a new instance of the 'cdi' interface. 
-func newHandler(opts ...Option) (Interface, error) { - c := &cdiHandler{} - for _, opt := range opts { - opt(c) - } - - if !c.enabled { - return &null{}, nil - } - - if c.logger == nil { - c.logger = logrus.StandardLogger() - } - if c.nvml == nil { - c.nvml = nvml.New() - } - if c.nvdevice == nil { - c.nvdevice = nvdevice.New(nvdevice.WithNvml(c.nvml)) - } - if c.deviceIDStrategy == "" { - c.deviceIDStrategy = "uuid" - } - if c.driverRoot == "" { - c.driverRoot = "/" - } - if c.targetDriverRoot == "" { - c.targetDriverRoot = c.driverRoot - } - - deviceNamer, err := nvcdi.NewDeviceNamer(c.deviceIDStrategy) - if err != nil { - return nil, err - } - - c.cdilibs = make(map[string]nvcdi.Interface) - - c.cdilibs["gpu"], err = nvcdi.New( - nvcdi.WithLogger(c.logger), - nvcdi.WithNvmlLib(c.nvml), - nvcdi.WithDeviceLib(c.nvdevice), - nvcdi.WithNVIDIACTKPath(c.nvidiaCTKPath), - nvcdi.WithDriverRoot(c.driverRoot), - nvcdi.WithDeviceNamers(deviceNamer), - nvcdi.WithVendor(c.vendor), - nvcdi.WithClass("gpu"), - ) - if err != nil { - return nil, fmt.Errorf("failed to create nvcdi library: %v", err) - } - - var additionalModes []string - if c.gdsEnabled { - additionalModes = append(additionalModes, "gds") - } - if c.mofedEnabled { - additionalModes = append(additionalModes, "mofed") - } - - for _, mode := range additionalModes { - lib, err := nvcdi.New( - nvcdi.WithLogger(c.logger), - nvcdi.WithNVIDIACTKPath(c.nvidiaCTKPath), - nvcdi.WithDriverRoot(c.driverRoot), - nvcdi.WithVendor(c.vendor), - nvcdi.WithMode(mode), - ) - if err != nil { - return nil, fmt.Errorf("failed to create nvcdi library: %v", err) - } - c.cdilibs[mode] = lib - } - - return c, nil -} - -// CreateSpecFile creates a CDI spec file for the specified devices. 
-func (cdi *cdiHandler) CreateSpecFile() error { - for class, cdilib := range cdi.cdilibs { - cdi.logger.Infof("Generating CDI spec for resource: %s/%s", cdi.vendor, class) - - if class == "gpu" { - ret := cdi.nvml.Init() - if ret != nvml.SUCCESS { - return fmt.Errorf("failed to initialize NVML: %v", ret) - } - defer cdi.nvml.Shutdown() - } - - spec, err := cdilib.GetSpec() - if err != nil { - return fmt.Errorf("failed to get CDI spec: %v", err) - } - - err = roottransform.New( - roottransform.WithRoot(cdi.driverRoot), - roottransform.WithTargetRoot(cdi.targetDriverRoot), - ).Transform(spec.Raw()) - if err != nil { - return fmt.Errorf("failed to transform driver root in CDI spec: %v", err) - } - - raw := spec.Raw() - specName, err := cdiapi.GenerateNameForSpec(raw) - if err != nil { - return fmt.Errorf("failed to generate spec name: %v", err) - } - - err = spec.Save(filepath.Join(cdiRoot, specName+".json")) - if err != nil { - return fmt.Errorf("failed to save CDI spec: %v", err) - } - } - - return nil -} - -// QualifiedName constructs a CDI qualified device name for the specified resources. -// Note: This assumes that the specified id matches the device name returned by the naming strategy. -func (cdi *cdiHandler) QualifiedName(class string, id string) string { - return cdiapi.QualifiedName(cdi.vendor, class, id) -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/factory.go b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/factory.go deleted file mode 100644 index 01173c62a..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/factory.go +++ /dev/null @@ -1,52 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package cdi - -import ( - "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" - - "k8s.io/klog/v2" -) - -// New is a factory method that creates a CDI handler for creating CDI specs. -func New(opts ...Option) (Interface, error) { - infolib := info.New() - - hasNVML, _ := infolib.HasNvml() - if !hasNVML { - klog.Warning("No valid resources detected, creating a null CDI handler") - return NewNullHandler(), nil - } - - return newHandler(opts...) -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/null.go b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/null.go deleted file mode 100644 index e5a46c73c..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/null.go +++ /dev/null @@ -1,59 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. 
NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package cdi - -import ( - "k8s.io/klog/v2" -) - -type null struct{} - -var _ Interface = &null{} - -// NewNullHandler returns an instance of the 'cdi' interface that can -// be used when CDI specs are not required. -func NewNullHandler() Interface { - return &null{} -} - -// CreateSpecFile is a no-op for the null handler. -func (n *null) CreateSpecFile() error { - return nil -} - -// QualifiedName is a no-op for the null handler. A error message is logged -// inidicating this should never be called for the null handler. -func (n *null) QualifiedName(class string, id string) string { - klog.Error("cannot return a qualified CDI device name with the null CDI handler") - return "" -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/options.go b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/options.go deleted file mode 100644 index 77bb69f4a..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/options.go +++ /dev/null @@ -1,103 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package cdi - -import ( - "github.com/NVIDIA/go-nvlib/pkg/nvml" -) - -// Option defines a function for passing options to the New() call -type Option func(*cdiHandler) - -// WithEnabled provides an Option to set the enabled flag used by the 'cdi' interface -func WithEnabled(enabled bool) Option { - return func(c *cdiHandler) { - c.enabled = enabled - } -} - -// WithDriverRoot provides an Option to set the driver root used by the 'cdi' interface -func WithDriverRoot(root string) Option { - return func(c *cdiHandler) { - c.driverRoot = root - } -} - -// WithTargetDriverRoot provides an Option to set the target driver root used by the 'cdi' interface -func WithTargetDriverRoot(root string) Option { - return func(c *cdiHandler) { - c.targetDriverRoot = root - } -} - -// WithNvidiaCTKPath provides an Option to set the nvidia-ctk path used by the 'cdi' interface -func WithNvidiaCTKPath(path string) Option { - return func(c *cdiHandler) { - c.nvidiaCTKPath = path - } -} - -// WithNvml provides an Option to set the NVML library used by the 'cdi' interface -func WithNvml(nvml nvml.Interface) Option { - return func(c *cdiHandler) { - c.nvml = nvml - } -} - -// 
WithDeviceIDStrategy provides an Option to set the device ID strategy used by the 'cdi' interface -func WithDeviceIDStrategy(strategy string) Option { - return func(c *cdiHandler) { - c.deviceIDStrategy = strategy - } -} - -// WithVendor provides an Option to set the vendor used by the 'cdi' interface -func WithVendor(vendor string) Option { - return func(c *cdiHandler) { - c.vendor = vendor - } -} - -// WithGdsEnabled provides and option to set whether a GDS CDI spec should be generated -func WithGdsEnabled(enabled bool) Option { - return func(c *cdiHandler) { - c.gdsEnabled = enabled - } -} - -// WithMofedEnabled provides and option to set whether a MOFED CDI spec should be generated -func WithMofedEnabled(enabled bool) Option { - return func(c *cdiHandler) { - c.mofedEnabled = enabled - } -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/info/version.go b/pkg/device-plugin/nvidiadevice/nvinternal/info/version.go deleted file mode 100644 index 503b5ded8..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/info/version.go +++ /dev/null @@ -1,64 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package info - -import "strings" - -// version must be set by go build's -X main.version= option in the Makefile. -var version = "unknown" - -// gitCommit will be the hash that the binary was built from -// and will be populated by the Makefile. -var gitCommit = "" - -// GetVersionParts returns the different version components. -func GetVersionParts() []string { - v := []string{version} - - if gitCommit != "" { - v = append(v, "commit: "+gitCommit) - } - - return v -} - -// GetVersionString returns the string representation of the version. -func GetVersionString(more ...string) string { - v := append(GetVersionParts(), more...) - return strings.Join(v, "\n") -} - -// GetVersion returns the version of the binary. -func GetVersion() string { - return version -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go deleted file mode 100644 index 1066c035a..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go +++ /dev/null @@ -1,42 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package plugin - -import "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" - -// Interface defines the API for the plugin package -type Interface interface { - Devices() rm.Devices - Start() error - Stop() error -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/api.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/api.go deleted file mode 100644 index 59f7e9c71..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/api.go +++ /dev/null @@ -1,41 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package manager - -import "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" - -// Interface defines the API for the plugin manager package -type Interface interface { - GetPlugins() ([]plugin.Interface, error) - CreateCDISpecFile() error -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/factory.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/factory.go deleted file mode 100644 index 5286fe699..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/factory.go +++ /dev/null @@ -1,152 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. 
- */ - -package manager - -import ( - "fmt" - - "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" - "github.com/NVIDIA/go-nvlib/pkg/nvml" - "k8s.io/klog/v2" - - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" -) - -type manager struct { - migStrategy string - failOnInitError bool - nvmllib nvml.Interface - - cdiHandler cdi.Interface - cdiEnabled bool - config *nvidia.DeviceConfig - infolib info.Interface -} - -// New creates a new plugin manager with the supplied options. -func New(opts ...Option) (Interface, error) { - m := &manager{} - for _, opt := range opts { - opt(m) - } - - if m.config == nil { - klog.Warning("no config provided, returning a null manager") - return &null{}, nil - } - - if m.infolib == nil { - m.infolib = info.New() - } - if m.cdiHandler == nil { - m.cdiHandler = cdi.NewNullHandler() - } - - mode, err := m.resolveMode() - if err != nil { - return nil, err - } - - if mode != "nvml" && m.cdiEnabled { - klog.Warning("CDI is not supported; disabling CDI.") - m.cdiEnabled = false - } - - switch mode { - case "nvml": - if m.nvmllib == nil { - m.nvmllib = nvml.New() - } - ret := m.nvmllib.Init() - if ret != nvml.SUCCESS { - klog.Errorf("Failed to initialize NVML: %v.", ret) - klog.Errorf("If this is a GPU node, did you set the docker default runtime to `nvidia`?") - klog.Errorf("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") - klog.Errorf("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") - klog.Errorf("If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes") - if m.failOnInitError { - return nil, fmt.Errorf("nvml init failed: %v", ret) - } - klog.Warningf("nvml init failed: %v", ret) - return &null{}, nil - } - defer m.nvmllib.Shutdown() - - return (*nvmlmanager)(m), nil - case "tegra": - return (*tegramanager)(m), nil - 
case "null": - return &null{}, nil - } - - return nil, fmt.Errorf("unknown mode: %v", mode) -} - -func (m *manager) resolveMode() (string, error) { - // logWithReason logs the output of the has* / is* checks from the info.Interface - logWithReason := func(f func() (bool, string), tag string) bool { - is, reason := f() - if !is { - tag = "non-" + tag - } - klog.Infof("Detected %v platform: %v", tag, reason) - return is - } - - hasNVML := logWithReason(m.infolib.HasNvml, "NVML") - isTegra := logWithReason(m.infolib.IsTegraSystem, "Tegra") - - if !hasNVML && !isTegra { - klog.Error("Incompatible platform detected") - klog.Error("If this is a GPU node, did you configure the NVIDIA Container Toolkit?") - klog.Error("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") - klog.Error("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") - klog.Error("If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes") - if m.failOnInitError { - return "", fmt.Errorf("platform detection failed") - } - return "null", nil - } - - // The NVIDIA container stack does not yet support the use of integrated AND discrete GPUs on the same node. - if isTegra { - if hasNVML { - klog.Warning("Disabling Tegra-based resources on NVML system") - return "nvml", nil - } - return "tegra", nil - } - - return "nvml", nil -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/null.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/null.go deleted file mode 100644 index 8cfcae63a..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/null.go +++ /dev/null @@ -1,49 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package manager - -import ( - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" -) - -type null struct{} - -// GetPlugins returns an empty set of Plugins for the null manager -func (m *null) GetPlugins() ([]plugin.Interface, error) { - return nil, nil -} - -// CreateCDISpecFile creates the spec is a no-op for the null plugin -func (m *null) CreateCDISpecFile() error { - return nil -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/nvml.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/nvml.go deleted file mode 100644 index 35abe3abe..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/nvml.go +++ /dev/null @@ -1,61 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package manager - -import ( - "fmt" - - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" -) - -type nvmlmanager manager - -// GetPlugins returns the plugins associated with the NVML resources available on the node -func (m *nvmlmanager) GetPlugins() ([]plugin.Interface, error) { - rms, err := rm.NewNVMLResourceManagers(m.nvmllib, m.config) - if err != nil { - return nil, fmt.Errorf("failed to construct NVML resource managers: %v", err) - } - - var plugins []plugin.Interface - for _, r := range rms { - plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled)) - } - return plugins, nil -} - -// CreateCDISpecFile creates forwards the request to the CDI handler -func (m *nvmlmanager) CreateCDISpecFile() error { - return m.cdiHandler.CreateSpecFile() -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/options.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/options.go deleted file mode 100644 index 44e87beed..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/options.go 
+++ /dev/null @@ -1,84 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. 
- */ - -package manager - -import ( - "github.com/NVIDIA/go-nvlib/pkg/nvml" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" -) - -// Option is a function that configures a manager -type Option func(*manager) - -// WithCDIEnabled sets whether CDI is enabled for the manager -func WithCDIEnabled(enabled bool) Option { - return func(m *manager) { - m.cdiEnabled = enabled - } -} - -// WithCDIHandler sets the CDI handler for the manager -func WithCDIHandler(handler cdi.Interface) Option { - return func(m *manager) { - m.cdiHandler = handler - } -} - -// WithNVML sets the NVML handler for the manager -func WithNVML(nvmllib nvml.Interface) Option { - return func(m *manager) { - m.nvmllib = nvmllib - } -} - -// WithFailOnInitError sets whether the manager should fail on initialization errors -func WithFailOnInitError(failOnInitError bool) Option { - return func(m *manager) { - m.failOnInitError = failOnInitError - } -} - -// WithMigStrategy sets the MIG strategy for the manager -func WithMigStrategy(migStrategy string) Option { - return func(m *manager) { - m.migStrategy = migStrategy - } -} - -// WithConfig sets the config reference for the manager -func WithConfig(config *nvidia.DeviceConfig) Option { - return func(m *manager) { - m.config = config - } -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/tegra.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/tegra.go deleted file mode 100644 index 8c1801e26..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/tegra.go +++ /dev/null @@ -1,61 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package manager - -import ( - "fmt" - - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" -) - -type tegramanager manager - -// GetPlugins returns the plugins associated with the NVML resources available on the node -func (m *tegramanager) GetPlugins() ([]plugin.Interface, error) { - rms, err := rm.NewTegraResourceManagers(m.config) - if err != nil { - return nil, fmt.Errorf("failed to construct NVML resource managers: %v", err) - } - - var plugins []plugin.Interface - for _, r := range rms { - plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled)) - } - return plugins, nil -} - -// CreateCDISpecFile creates the spec is a no-op for the tegra plugin -func (m *tegramanager) CreateCDISpecFile() error { - return nil -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go deleted file mode 100644 index 087e7cf6c..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go +++ /dev/null @@ -1,671 +0,0 @@ -/* - * 
SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. 
- */ - -package plugin - -import ( - "bytes" - "encoding/json" - "errors" - "fmt" - "net" - "os" - "os/exec" - "path" - "path/filepath" - "strconv" - "strings" - "time" - - spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" - cdiapi "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" - "github.com/google/uuid" - "golang.org/x/net/context" - "google.golang.org/grpc" - "k8s.io/apimachinery/pkg/util/yaml" - "k8s.io/klog/v2" - kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" - - "github.com/Project-HAMi/HAMi/pkg/device" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" - "github.com/Project-HAMi/HAMi/pkg/util" -) - -// Constants for use by the 'volume-mounts' device list strategy -const ( - deviceListAsVolumeMountsHostPath = "/dev/null" - deviceListAsVolumeMountsContainerPathRoot = "/var/run/nvidia-container-devices" - NodeLockNvidia = "hami.io/mutex.lock" -) - -var ( - hostHookPath string - ConfigFile *string -) - -func init() { - hostHookPath, _ = os.LookupEnv("HOOK_PATH") -} - -// NvidiaDevicePlugin implements the Kubernetes device plugin API -type NvidiaDevicePlugin struct { - rm rm.ResourceManager - config *nvidia.DeviceConfig - deviceListEnvvar string - deviceListStrategies spec.DeviceListStrategies - socket string - schedulerConfig nvidia.NvidiaConfig - - cdiHandler cdi.Interface - cdiEnabled bool - cdiAnnotationPrefix string - - operatingMode string - migCurrent nvidia.MigPartedSpec - - server *grpc.Server - health chan *rm.Device - stop chan interface{} -} - -func readFromConfigFile(sConfig *nvidia.NvidiaConfig) (string, error) { - jsonbyte, err := os.ReadFile("/config/config.json") - mode := "hami-core" - if err != nil { - return "", err - } - var deviceConfigs nvidia.DevicePluginConfigs - err = json.Unmarshal(jsonbyte, &deviceConfigs) - if err != 
nil { - return "", err - } - klog.Infof("Device Plugin Configs: %v", fmt.Sprintf("%v", deviceConfigs)) - for _, val := range deviceConfigs.Nodeconfig { - if os.Getenv(util.NodeNameEnvName) == val.Name { - klog.Infof("Reading config from file %s", val.Name) - if val.Devicememoryscaling > 0 { - sConfig.DeviceMemoryScaling = val.Devicememoryscaling - } - if val.Devicecorescaling > 0 { - sConfig.DeviceCoreScaling = val.Devicecorescaling - } - if val.Devicesplitcount > 0 { - sConfig.DeviceSplitCount = val.Devicesplitcount - } - if val.FilterDevice != nil && (len(val.FilterDevice.UUID) > 0 || len(val.FilterDevice.Index) > 0) { - nvidia.DevicePluginFilterDevice = val.FilterDevice - } - if len(val.OperatingMode) > 0 { - mode = val.OperatingMode - } - klog.Infof("FilterDevice: %v", val.FilterDevice) - } - } - return mode, nil -} - -// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin -func NewNvidiaDevicePlugin(config *nvidia.DeviceConfig, resourceManager rm.ResourceManager, cdiHandler cdi.Interface, cdiEnabled bool) *NvidiaDevicePlugin { - _, name := resourceManager.Resource().Split() - - deviceListStrategies, _ := spec.NewDeviceListStrategies(*config.Flags.Plugin.DeviceListStrategy) - - sConfig, err := device.LoadConfig(*ConfigFile) - klog.Infoln("reading config=", config, "resourceName", config.ResourceName, "configfile=", *ConfigFile, "sconfig=", sConfig) - if err != nil { - klog.Fatalf(`failed to load device config file %s: %v`, *ConfigFile, err) - } - mode, err := readFromConfigFile(&sConfig.NvidiaConfig) - if err != nil { - klog.Errorf("readFromConfigFile err:%s", err.Error()) - } - // Initialize devices with configuration - if err := device.InitDevicesWithConfig(sConfig); err != nil { - klog.Fatalf("failed to initialize devices: %v", err) - } - return &NvidiaDevicePlugin{ - rm: resourceManager, - config: config, - deviceListEnvvar: "NVIDIA_VISIBLE_DEVICES", - deviceListStrategies: deviceListStrategies, - socket: 
kubeletdevicepluginv1beta1.DevicePluginPath + "nvidia-" + name + ".sock", - cdiHandler: cdiHandler, - cdiEnabled: cdiEnabled, - cdiAnnotationPrefix: *config.Flags.Plugin.CDIAnnotationPrefix, - schedulerConfig: sConfig.NvidiaConfig, - operatingMode: mode, - migCurrent: nvidia.MigPartedSpec{}, - - // These will be reinitialized every - // time the plugin server is restarted. - server: nil, - health: nil, - stop: nil, - } -} - -func (plugin *NvidiaDevicePlugin) initialize() { - plugin.server = grpc.NewServer([]grpc.ServerOption{}...) - plugin.health = make(chan *rm.Device) - plugin.stop = make(chan interface{}) -} - -func (plugin *NvidiaDevicePlugin) cleanup() { - close(plugin.stop) - plugin.server = nil - plugin.health = nil - plugin.stop = nil -} - -// Devices returns the full set of devices associated with the plugin. -func (plugin *NvidiaDevicePlugin) Devices() rm.Devices { - return plugin.rm.Devices() -} - -// Start starts the gRPC server, registers the device plugin with the Kubelet, -// and starts the device healthchecks. 
-func (plugin *NvidiaDevicePlugin) Start() error { - plugin.initialize() - - err := plugin.Serve() - if err != nil { - klog.Infof("Could not start device plugin for '%s': %s", plugin.rm.Resource(), err) - plugin.cleanup() - return err - } - klog.Infof("Starting to serve '%s' on %s", plugin.rm.Resource(), plugin.socket) - - err = plugin.Register() - if err != nil { - klog.Infof("Could not register device plugin: %s", err) - plugin.Stop() - return err - } - klog.Infof("Registered device plugin for '%s' with Kubelet", plugin.rm.Resource()) - - if plugin.operatingMode == "mig" { - cmd := exec.Command("nvidia-mig-parted", "export") - var stdout, stderr bytes.Buffer - cmd.Stdout = &stdout - cmd.Stderr = &stderr - err := cmd.Run() - if err != nil { - klog.Fatalf("nvidia-mig-parted failed with %s\n", err) - } - outStr := stdout.Bytes() - yaml.Unmarshal(outStr, &plugin.migCurrent) - os.WriteFile("/tmp/migconfig.yaml", outStr, os.ModePerm) - if len(plugin.migCurrent.MigConfigs["current"]) == 1 && len(plugin.migCurrent.MigConfigs["current"][0].Devices) == 0 { - idx := 0 - plugin.migCurrent.MigConfigs["current"][0].Devices = make([]int32, 0) - for idx < GetDeviceNums() { - plugin.migCurrent.MigConfigs["current"][0].Devices = append(plugin.migCurrent.MigConfigs["current"][0].Devices, int32(idx)) - idx++ - } - } - klog.Infoln("Mig export", plugin.migCurrent) - } - go func() { - err := plugin.rm.CheckHealth(plugin.stop, plugin.health) - if err != nil { - klog.Infof("Failed to start health check: %v; continuing with health checks disabled", err) - } - }() - - go func() { - plugin.WatchAndRegister() - }() - - return nil -} - -// Stop stops the gRPC server. 
-func (plugin *NvidiaDevicePlugin) Stop() error { - if plugin == nil || plugin.server == nil { - return nil - } - klog.Infof("Stopping to serve '%s' on %s", plugin.rm.Resource(), plugin.socket) - plugin.server.Stop() - if err := os.Remove(plugin.socket); err != nil && !os.IsNotExist(err) { - return err - } - plugin.cleanup() - return nil -} - -// Serve starts the gRPC server of the device plugin. -func (plugin *NvidiaDevicePlugin) Serve() error { - os.Remove(plugin.socket) - sock, err := net.Listen("unix", plugin.socket) - if err != nil { - return err - } - - kubeletdevicepluginv1beta1.RegisterDevicePluginServer(plugin.server, plugin) - - go func() { - lastCrashTime := time.Now() - restartCount := 0 - for { - klog.Infof("Starting GRPC server for '%s'", plugin.rm.Resource()) - err := plugin.server.Serve(sock) - if err == nil { - break - } - - klog.Infof("GRPC server for '%s' crashed with error: %v", plugin.rm.Resource(), err) - - // restart if it has not been too often - // i.e. if server has crashed more than 5 times and it didn't last more than one hour each time - if restartCount > 5 { - // quit - klog.Fatalf("GRPC server for '%s' has repeatedly crashed recently. Quitting", plugin.rm.Resource()) - } - timeSinceLastCrash := time.Since(lastCrashTime).Seconds() - lastCrashTime = time.Now() - if timeSinceLastCrash > 3600 { - // it has been one hour since the last crash.. reset the count - // to reflect on the frequency - restartCount = 1 - } else { - restartCount++ - } - } - }() - - // Wait for server to start by launching a blocking connexion - conn, err := plugin.dial(plugin.socket, 5*time.Second) - if err != nil { - return err - } - conn.Close() - - return nil -} - -// Register registers the device plugin for the given resourceName with Kubelet. 
-func (plugin *NvidiaDevicePlugin) Register() error { - conn, err := plugin.dial(kubeletdevicepluginv1beta1.KubeletSocket, 5*time.Second) - if err != nil { - return err - } - defer conn.Close() - - client := kubeletdevicepluginv1beta1.NewRegistrationClient(conn) - reqt := &kubeletdevicepluginv1beta1.RegisterRequest{ - Version: kubeletdevicepluginv1beta1.Version, - Endpoint: path.Base(plugin.socket), - ResourceName: string(plugin.rm.Resource()), - Options: &kubeletdevicepluginv1beta1.DevicePluginOptions{ - GetPreferredAllocationAvailable: false, - }, - } - - _, err = client.Register(context.Background(), reqt) - if err != nil { - return err - } - return nil -} - -// GetDevicePluginOptions returns the values of the optional settings for this plugin -func (plugin *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *kubeletdevicepluginv1beta1.Empty) (*kubeletdevicepluginv1beta1.DevicePluginOptions, error) { - options := &kubeletdevicepluginv1beta1.DevicePluginOptions{ - GetPreferredAllocationAvailable: false, - } - return options, nil -} - -// ListAndWatch lists devices and update that list according to the health status -func (plugin *NvidiaDevicePlugin) ListAndWatch(e *kubeletdevicepluginv1beta1.Empty, s kubeletdevicepluginv1beta1.DevicePlugin_ListAndWatchServer) error { - s.Send(&kubeletdevicepluginv1beta1.ListAndWatchResponse{Devices: plugin.apiDevices()}) - - for { - select { - case <-plugin.stop: - return nil - case d := <-plugin.health: - // FIXME: there is no way to recover from the Unhealthy state. 
- d.Health = kubeletdevicepluginv1beta1.Unhealthy - klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.ID) - s.Send(&kubeletdevicepluginv1beta1.ListAndWatchResponse{Devices: plugin.apiDevices()}) - } - } -} - -// GetPreferredAllocation returns the preferred allocation from the set of devices specified in the request -func (plugin *NvidiaDevicePlugin) GetPreferredAllocation(ctx context.Context, r *kubeletdevicepluginv1beta1.PreferredAllocationRequest) (*kubeletdevicepluginv1beta1.PreferredAllocationResponse, error) { - response := &kubeletdevicepluginv1beta1.PreferredAllocationResponse{} - /*for _, req := range r.ContainerRequests { - devices, err := plugin.rm.GetPreferredAllocation(req.AvailableDeviceIDs, req.MustIncludeDeviceIDs, int(req.AllocationSize)) - if err != nil { - return nil, fmt.Errorf("error getting list of preferred allocation devices: %v", err) - } - - resp := &kubeletdevicepluginv1beta1.ContainerPreferredAllocationResponse{ - DeviceIDs: devices, - } - - response.ContainerResponses = append(response.ContainerResponses, resp) - }*/ - return response, nil -} - -// Allocate which return list of devices. -func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *kubeletdevicepluginv1beta1.AllocateRequest) (*kubeletdevicepluginv1beta1.AllocateResponse, error) { - klog.InfoS("Allocate", "request", reqs) - responses := kubeletdevicepluginv1beta1.AllocateResponse{} - nodename := os.Getenv(util.NodeNameEnvName) - current, err := util.GetPendingPod(ctx, nodename) - if err != nil { - //nodelock.ReleaseNodeLock(nodename, NodeLockNvidia, current) - return &kubeletdevicepluginv1beta1.AllocateResponse{}, err - } - klog.Infof("Allocate pod name is %s/%s, annotation is %+v", current.Namespace, current.Name, current.Annotations) - - for idx, req := range reqs.ContainerRequests { - // If the devices being allocated are replicas, then (conditionally) - // error out if more than one resource is being allocated. 
- - if strings.Contains(req.DevicesIDs[0], "MIG") { - if plugin.config.Sharing.TimeSlicing.FailRequestsGreaterThanOne && rm.AnnotatedIDs(req.DevicesIDs).AnyHasAnnotations() { - if len(req.DevicesIDs) > 1 { - device.PodAllocationFailed(nodename, current, NodeLockNvidia) - return nil, fmt.Errorf("request for '%v: %v' too large: maximum request size for shared resources is 1", plugin.rm.Resource(), len(req.DevicesIDs)) - } - } - - for _, id := range req.DevicesIDs { - if !plugin.rm.Devices().Contains(id) { - device.PodAllocationFailed(nodename, current, NodeLockNvidia) - return nil, fmt.Errorf("invalid allocation request for '%s': unknown device: %s", plugin.rm.Resource(), id) - } - } - - response, err := plugin.getAllocateResponse(req.DevicesIDs) - if err != nil { - device.PodAllocationFailed(nodename, current, NodeLockNvidia) - return nil, fmt.Errorf("failed to get allocate response: %v", err) - } - responses.ContainerResponses = append(responses.ContainerResponses, response) - } else { - currentCtr, devreq, err := GetNextDeviceRequest(nvidia.NvidiaGPUDevice, *current) - klog.Infoln("deviceAllocateFromAnnotation=", devreq) - if err != nil { - device.PodAllocationFailed(nodename, current, NodeLockNvidia) - return &kubeletdevicepluginv1beta1.AllocateResponse{}, err - } - if len(devreq) != len(reqs.ContainerRequests[idx].DevicesIDs) { - device.PodAllocationFailed(nodename, current, NodeLockNvidia) - return &kubeletdevicepluginv1beta1.AllocateResponse{}, errors.New("device number not matched") - } - response, err := plugin.getAllocateResponse(plugin.GetContainerDeviceStrArray(devreq)) - if err != nil { - return nil, fmt.Errorf("failed to get allocate response: %v", err) - } - - err = EraseNextDeviceTypeFromAnnotation(nvidia.NvidiaGPUDevice, *current) - if err != nil { - device.PodAllocationFailed(nodename, current, NodeLockNvidia) - return &kubeletdevicepluginv1beta1.AllocateResponse{}, err - } - - if plugin.operatingMode != "mig" { - for i, dev := range devreq { - 
limitKey := fmt.Sprintf("CUDA_DEVICE_MEMORY_LIMIT_%v", i) - response.Envs[limitKey] = fmt.Sprintf("%vm", dev.Usedmem) - } - response.Envs["CUDA_DEVICE_SM_LIMIT"] = fmt.Sprint(devreq[0].Usedcores) - response.Envs["CUDA_DEVICE_MEMORY_SHARED_CACHE"] = fmt.Sprintf("%s/vgpu/%v.cache", hostHookPath, uuid.New().String()) - if plugin.schedulerConfig.DeviceMemoryScaling > 1 { - response.Envs["CUDA_OVERSUBSCRIBE"] = "true" - } - if plugin.schedulerConfig.DisableCoreLimit { - response.Envs[util.CoreLimitSwitch] = "disable" - } - cacheFileHostDirectory := fmt.Sprintf("%s/vgpu/containers/%s_%s", hostHookPath, current.UID, currentCtr.Name) - os.RemoveAll(cacheFileHostDirectory) - - os.MkdirAll(cacheFileHostDirectory, 0777) - os.Chmod(cacheFileHostDirectory, 0777) - os.MkdirAll("/tmp/vgpulock", 0777) - os.Chmod("/tmp/vgpulock", 0777) - response.Mounts = append(response.Mounts, - &kubeletdevicepluginv1beta1.Mount{ContainerPath: fmt.Sprintf("%s/vgpu/libvgpu.so", hostHookPath), - HostPath: GetLibPath(), - ReadOnly: true}, - &kubeletdevicepluginv1beta1.Mount{ContainerPath: fmt.Sprintf("%s/vgpu", hostHookPath), - HostPath: cacheFileHostDirectory, - ReadOnly: false}, - &kubeletdevicepluginv1beta1.Mount{ContainerPath: "/tmp/vgpulock", - HostPath: "/tmp/vgpulock", - ReadOnly: false}, - ) - found := false - for _, val := range currentCtr.Env { - if strings.Compare(val.Name, "CUDA_DISABLE_CONTROL") == 0 { - // if env existed but is set to false or can not be parsed, ignore - t, _ := strconv.ParseBool(val.Value) - if !t { - continue - } - // only env existed and set to true, we mark it "found" - found = true - break - } - } - if !found { - response.Mounts = append(response.Mounts, &kubeletdevicepluginv1beta1.Mount{ContainerPath: "/etc/ld.so.preload", - HostPath: hostHookPath + "/vgpu/ld.so.preload", - ReadOnly: true}, - ) - } - _, err = os.Stat(fmt.Sprintf("%s/vgpu/license", hostHookPath)) - if err == nil { - response.Mounts = append(response.Mounts, &kubeletdevicepluginv1beta1.Mount{ - 
ContainerPath: "/tmp/license", - HostPath: fmt.Sprintf("%s/vgpu/license", hostHookPath), - ReadOnly: true, - }) - response.Mounts = append(response.Mounts, &kubeletdevicepluginv1beta1.Mount{ - ContainerPath: "/usr/bin/vgpuvalidator", - HostPath: fmt.Sprintf("%s/vgpu/vgpuvalidator", hostHookPath), - ReadOnly: true, - }) - } - } - responses.ContainerResponses = append(responses.ContainerResponses, response) - } - } - klog.Infoln("Allocate Response", responses.ContainerResponses) - device.PodAllocationTrySuccess(nodename, nvidia.NvidiaGPUDevice, NodeLockNvidia, current) - return &responses, nil -} - -func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*kubeletdevicepluginv1beta1.ContainerAllocateResponse, error) { - deviceIDs := plugin.deviceIDsFromAnnotatedDeviceIDs(requestIds) - - responseID := uuid.New().String() - response, err := plugin.getAllocateResponseForCDI(responseID, deviceIDs) - if err != nil { - return nil, fmt.Errorf("failed to get allocate response for CDI: %v", err) - } - - response.Envs = plugin.apiEnvs(plugin.deviceListEnvvar, deviceIDs) - //if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyVolumeMounts) || plugin.deviceListStrategies.Includes(spec.DeviceListStrategyEnvvar) { - // response.Envs = plugin.apiEnvs(plugin.deviceListEnvvar, deviceIDs) - //} - /* - if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyVolumeMounts) { - response.Envs = plugin.apiEnvs(plugin.deviceListEnvvar, []string{deviceListAsVolumeMountsContainerPathRoot}) - response.Mounts = plugin.apiMounts(deviceIDs) - }*/ - if *plugin.config.Flags.Plugin.PassDeviceSpecs { - response.Devices = plugin.apiDeviceSpecs(*plugin.config.Flags.NvidiaDriverRoot, requestIds) - } - if *plugin.config.Flags.GDSEnabled { - response.Envs["NVIDIA_GDS"] = "enabled" - } - if *plugin.config.Flags.MOFEDEnabled { - response.Envs["NVIDIA_MOFED"] = "enabled" - } - - return &response, nil -} - -// getAllocateResponseForCDI returns the allocate response for the 
specified device IDs. -// This response contains the annotations required to trigger CDI injection in the container engine or nvidia-container-runtime. -func (plugin *NvidiaDevicePlugin) getAllocateResponseForCDI(responseID string, deviceIDs []string) (kubeletdevicepluginv1beta1.ContainerAllocateResponse, error) { - response := kubeletdevicepluginv1beta1.ContainerAllocateResponse{} - - if !plugin.cdiEnabled { - return response, nil - } - - var devices []string - for _, id := range deviceIDs { - devices = append(devices, plugin.cdiHandler.QualifiedName("gpu", id)) - } - - if *plugin.config.Flags.GDSEnabled { - devices = append(devices, plugin.cdiHandler.QualifiedName("gds", "all")) - } - if *plugin.config.Flags.MOFEDEnabled { - devices = append(devices, plugin.cdiHandler.QualifiedName("mofed", "all")) - } - - if len(devices) == 0 { - return response, nil - } - - if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyCDIAnnotations) { - annotations, err := plugin.getCDIDeviceAnnotations(responseID, devices) - if err != nil { - return response, err - } - response.Annotations = annotations - } - - return response, nil -} - -func (plugin *NvidiaDevicePlugin) getCDIDeviceAnnotations(id string, devices []string) (map[string]string, error) { - annotations, err := cdiapi.UpdateAnnotations(map[string]string{}, "nvidia-device-plugin", id, devices) - if err != nil { - return nil, fmt.Errorf("failed to add CDI annotations: %v", err) - } - - if plugin.cdiAnnotationPrefix == spec.DefaultCDIAnnotationPrefix { - return annotations, nil - } - - // update annotations if a custom CDI prefix is configured - updatedAnnotations := make(map[string]string) - for k, v := range annotations { - newKey := plugin.cdiAnnotationPrefix + strings.TrimPrefix(k, spec.DefaultCDIAnnotationPrefix) - updatedAnnotations[newKey] = v - } - - return updatedAnnotations, nil -} - -// PreStartContainer is unimplemented for this plugin -func (plugin *NvidiaDevicePlugin) PreStartContainer(context.Context, 
*kubeletdevicepluginv1beta1.PreStartContainerRequest) (*kubeletdevicepluginv1beta1.PreStartContainerResponse, error) { - return &kubeletdevicepluginv1beta1.PreStartContainerResponse{}, nil -} - -// dial establishes the gRPC communication with the registered device plugin. -func (plugin *NvidiaDevicePlugin) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { - c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(), - grpc.WithTimeout(timeout), - grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { - return net.DialTimeout("unix", addr, timeout) - }), - ) - - if err != nil { - return nil, err - } - - return c, nil -} - -func (plugin *NvidiaDevicePlugin) deviceIDsFromAnnotatedDeviceIDs(ids []string) []string { - var deviceIDs []string - if *plugin.config.Flags.Plugin.DeviceIDStrategy == spec.DeviceIDStrategyUUID { - deviceIDs = rm.AnnotatedIDs(ids).GetIDs() - } - if *plugin.config.Flags.Plugin.DeviceIDStrategy == spec.DeviceIDStrategyIndex { - deviceIDs = plugin.rm.Devices().Subset(ids).GetIndices() - } - return deviceIDs -} - -func (plugin *NvidiaDevicePlugin) apiDevices() []*kubeletdevicepluginv1beta1.Device { - return plugin.rm.Devices().GetPluginDevices(plugin.schedulerConfig.DeviceSplitCount) -} - -func (plugin *NvidiaDevicePlugin) apiEnvs(envvar string, deviceIDs []string) map[string]string { - return map[string]string{ - envvar: strings.Join(deviceIDs, ","), - } -} - -func (plugin *NvidiaDevicePlugin) apiDeviceSpecs(driverRoot string, ids []string) []*kubeletdevicepluginv1beta1.DeviceSpec { - optional := map[string]bool{ - "/dev/nvidiactl": true, - "/dev/nvidia-uvm": true, - "/dev/nvidia-uvm-tools": true, - "/dev/nvidia-modeset": true, - } - - paths := plugin.rm.GetDevicePaths(ids) - - var specs []*kubeletdevicepluginv1beta1.DeviceSpec - for _, p := range paths { - if optional[p] { - if _, err := os.Stat(p); err != nil { - continue - } - } - spec := &kubeletdevicepluginv1beta1.DeviceSpec{ - 
ContainerPath: p, - HostPath: filepath.Join(driverRoot, p), - Permissions: "rw", - } - specs = append(specs, spec) - } - - return specs -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/util_test.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/util_test.go deleted file mode 100644 index 28a8cdebe..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/util_test.go +++ /dev/null @@ -1,156 +0,0 @@ -/** -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-**/ - -package plugin - -import ( - "testing" - - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" - "github.com/Project-HAMi/HAMi/pkg/util" -) - -func TestGenerateMigTemplate(t *testing.T) { - sconfig := nvidia.NvidiaConfig{ - MigGeometriesList: []util.AllowedMigGeometries{ - { - Models: []string{"A30"}, - Geometries: []util.Geometry{ - {util.MigTemplate{Name: "1g.6gb", Memory: 6144, Count: 4}}, - {util.MigTemplate{Name: "2g.12gb", Memory: 12288, Count: 2}}, - {util.MigTemplate{Name: "4g.24gb", Memory: 24576, Count: 1}}, - }, - }, - { - Models: []string{"A100-SXM4-40GB", "A100-40GB-PCIe", "A100-PCIE-40GB", "A100-SXM4-40GB"}, - Geometries: []util.Geometry{ - {util.MigTemplate{Name: "1g.5gb", Memory: 5120, Count: 7}}, - {util.MigTemplate{Name: "2g.10gb", Memory: 10240, Count: 3}}, - {util.MigTemplate{Name: "1g.5gb", Memory: 5120, Count: 1}}, - {util.MigTemplate{Name: "3g.20gb", Memory: 20480, Count: 2}}, - {util.MigTemplate{Name: "7g.40gb", Memory: 40960, Count: 1}}, - }, - }, - { - Models: []string{"A100-SXM4-80GB", "A100-80GB-PCIe", "A100-PCIE-80GB"}, - Geometries: []util.Geometry{ - {util.MigTemplate{Name: "1g.10gb", Memory: 10240, Count: 7}}, - {util.MigTemplate{Name: "2g.20gb", Memory: 20480, Count: 3}}, - {util.MigTemplate{Name: "1g.10gb", Memory: 10240, Count: 1}}, - {util.MigTemplate{Name: "3g.40gb", Memory: 40960, Count: 2}}, - {util.MigTemplate{Name: "7g.80gb", Memory: 81920, Count: 1}}, - }, - }, - }, - } - - plugin := NvidiaDevicePlugin{ - operatingMode: "mig", - schedulerConfig: sconfig, - } - plugin.migCurrent = nvidia.MigPartedSpec{ - Version: "v1", - MigConfigs: make(map[string]nvidia.MigConfigSpecSlice), - } - plugin.migCurrent.MigConfigs["current"] = nvidia.MigConfigSpecSlice{ - nvidia.MigConfigSpec{ - Devices: []int32{0, 1}, - MigEnabled: true, - MigDevices: make(map[string]int32), // Ensure this map is initialized - }, - } - - testCases := []struct { - name string - model string - deviceIdx int - containerDev util.ContainerDevice - expectedPos 
int - expectedReset bool - expectedMig map[string]int32 - }{ - { - name: "2g.10gb template", - model: "A100-SXM4-40GB", - deviceIdx: 0, - containerDev: util.ContainerDevice{ - Idx: 0, - UUID: "aaaaabbbb[1-1]", - Usedmem: 3000, - }, - expectedPos: 1, - expectedReset: true, - expectedMig: map[string]int32{ - "2g.10gb": 3, - }, - }, - { - name: "1g.5gb template", - model: "A100-SXM4-40GB", - deviceIdx: 0, - containerDev: util.ContainerDevice{ - Idx: 0, - UUID: "aaaaabbbb[0-1]", - Usedmem: 3000, - }, - expectedPos: 1, - expectedReset: true, - expectedMig: map[string]int32{ - "1g.5gb": 7, - }, - }, - { - name: "no reset needed", - model: "A100-SXM4-40GB", - deviceIdx: 0, - containerDev: util.ContainerDevice{ - Idx: 0, - UUID: "aaaaabbbb[0-2]", - Usedmem: 3000, - }, - expectedPos: 2, - expectedReset: false, - expectedMig: map[string]int32{ - "1g.5gb": 8, - }, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - pos, needsreset := plugin.GenerateMigTemplate(tc.model, tc.deviceIdx, tc.containerDev) - - // Check if the position matches the expected value - if pos != tc.expectedPos { - t.Errorf("expected position %d, got %d", tc.expectedPos, pos) - } - - // Check if the reset flag matches the expected value - if needsreset != tc.expectedReset { - t.Errorf("expected reset %v, got %v", tc.expectedReset, needsreset) - } - - // Check if the mig devices match the expected values - migDevices := plugin.migCurrent.MigConfigs["current"][0].MigDevices - for k, v := range tc.expectedMig { - actual, ok := migDevices[k] - if !ok || actual != v { - t.Errorf("expected %s count %d, got %d", k, v, actual) - } - } - }) - } -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/allocate.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/allocate.go deleted file mode 100644 index d83cfb2ee..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/allocate.go +++ /dev/null @@ -1,137 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi 
Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package rm - -import ( - "fmt" - "sort" - - "github.com/NVIDIA/go-gpuallocator/gpuallocator" -) - -var alignedAllocationPolicy = gpuallocator.NewBestEffortPolicy() - -// getPreferredAllocation runs an allocation algorithm over the inputs. -// The algorithm chosen is based both on the incoming set of available devices and various config settings. -func (r *resourceManager) getPreferredAllocation(available, required []string, size int) ([]string, error) { - // If all of the available devices are full GPUs without replicas, then - // calculate an aligned allocation across those devices. 
- if r.Devices().AlignedAllocationSupported() && !AnnotatedIDs(available).AnyHasAnnotations() { - return r.alignedAlloc(available, required, size) - } - - // Otherwise, distribute them evenly across all replicated GPUs - return r.distributedAlloc(available, required, size) -} - -// alignedAlloc shells out to the alignedAllocationPolicy that is set in -// order to calculate the preferred allocation. -func (r *resourceManager) alignedAlloc(available, required []string, size int) ([]string, error) { - var devices []string - - availableDevices, err := gpuallocator.NewDevicesFrom(available) - if err != nil { - return nil, fmt.Errorf("unable to retrieve list of available devices: %v", err) - } - - requiredDevices, err := gpuallocator.NewDevicesFrom(required) - if err != nil { - return nil, fmt.Errorf("unable to retrieve list of required devices: %v", err) - } - - allocatedDevices := alignedAllocationPolicy.Allocate(availableDevices, requiredDevices, size) - - for _, device := range allocatedDevices { - devices = append(devices, device.UUID) - } - - return devices, nil -} - -// distributedAlloc returns a list of devices such that any replicated -// devices are distributed across all replicated GPUs equally. It takes into -// account already allocated replicas to ensure a proper balance across them. -func (r *resourceManager) distributedAlloc(available, required []string, size int) ([]string, error) { - // Get the set of candidate devices as the difference between available and required. - candidates := r.devices.Subset(available).Difference(r.devices.Subset(required)).GetIDs() - needed := size - len(required) - - if len(candidates) < needed { - return nil, fmt.Errorf("not enough available devices to satisfy allocation") - } - - // For each candidate device, build a mapping of (stripped) device ID to - // total / available replicas for that device. 
- replicas := make(map[string]*struct{ total, available int }) - for _, c := range candidates { - id := AnnotatedID(c).GetID() - if _, exists := replicas[id]; !exists { - replicas[id] = &struct{ total, available int }{} - } - replicas[id].available++ - } - for d := range r.devices { - id := AnnotatedID(d).GetID() - if _, exists := replicas[id]; !exists { - continue - } - replicas[id].total++ - } - - // Grab the set of 'needed' devices one-by-one from the candidates list. - // Before selecting each candidate, first sort the candidate list using the - // replicas map above. After sorting, the first element in the list will - // contain the device with the least difference between total and available - // replications (based on what's already been allocated). Add this device - // to the list of devices to allocate, remove it from the candidate list, - // down its available count in the replicas map, and repeat. - var devices []string - for i := 0; i < needed; i++ { - sort.Slice(candidates, func(i, j int) bool { - iid := AnnotatedID(candidates[i]).GetID() - jid := AnnotatedID(candidates[j]).GetID() - idiff := replicas[iid].total - replicas[iid].available - jdiff := replicas[jid].total - replicas[jid].available - return idiff < jdiff - }) - id := AnnotatedID(candidates[0]).GetID() - replicas[id].available-- - devices = append(devices, candidates[0]) - candidates = candidates[1:] - } - - // Add the set of required devices to this list and return it. - devices = append(required, devices...) 
- - return devices, nil -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map_test.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map_test.go deleted file mode 100644 index c65147443..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map_test.go +++ /dev/null @@ -1,583 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. 
- */ - -package rm - -import ( - "fmt" - "testing" - - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" - - spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" - "github.com/stretchr/testify/require" - kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" -) - -func TestDeviceMapInsert(t *testing.T) { - device0 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0"}} - device0withIndex := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0"}, Index: "index"} - device1 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "1"}} - - testCases := []struct { - description string - deviceMap DeviceMap - key string - value *Device - expectedDeviceMap DeviceMap - }{ - { - description: "insert into empty map", - deviceMap: make(DeviceMap), - key: "resource", - value: &device0, - expectedDeviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - }, - }, - }, - { - description: "add to existing resource", - deviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - }, - }, - key: "resource", - value: &device1, - expectedDeviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - "1": &device1, - }, - }, - }, - { - description: "add new resource", - deviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - }, - }, - key: "resource1", - value: &device0, - expectedDeviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - }, - "resource1": Devices{ - "0": &device0, - }, - }, - }, - { - description: "overwrite existing device", - deviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - }, - }, - key: "resource", - value: &device0withIndex, - expectedDeviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0withIndex, - }, - }, - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - tc.deviceMap.insert(spec.ResourceName(tc.key), tc.value) - - require.EqualValues(t, tc.expectedDeviceMap, tc.deviceMap) - }) - } -} - -func TestUpdateDeviceMapWithReplicas(t *testing.T) { - device0 
:= Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0"}, Index: "0"} - device1 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "1"}} - device2 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "2"}} - device3 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "3"}} - - testCases := []struct { - description string - config *nvidia.DeviceConfig - devices DeviceMap - expectedDeviceMap DeviceMap - }{ - { - description: "Update device map with replicas", - config: &nvidia.DeviceConfig{ - Config: &spec.Config{ - Sharing: spec.Sharing{ - TimeSlicing: spec.ReplicatedResources{ - Resources: []spec.ReplicatedResource{ - { - Name: "resource1", - Replicas: 2, - Rename: "replicated-resource1", - Devices: spec.ReplicatedDevices{ - All: true, - }, - }, - { - Name: "resource2", - Replicas: 1, - Devices: spec.ReplicatedDevices{ - All: true, - }, - }, - }, - }, - }, - }, - }, - devices: DeviceMap{ - "resource1": Devices{ - "0": &device0, - "1": &device1, - }, - "resource2": Devices{ - "2": &device2, - }, - "resource3": Devices{ - "3": &device3, - }, - }, - expectedDeviceMap: DeviceMap{ - "replicated-resource1": Devices{ - "0::0": &Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0::0"}, Index: "0"}, - "0::1": &Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0::1"}, Index: "0"}, - "1::0": &Device{Device: kubeletdevicepluginv1beta1.Device{ID: "1::0"}}, - "1::1": &Device{Device: kubeletdevicepluginv1beta1.Device{ID: "1::1"}}, - }, - "resource2": Devices{ - "2::0": &Device{Device: kubeletdevicepluginv1beta1.Device{ID: "2::0"}}, - }, - "resource3": Devices{ - "3": &device3, - }, - }, - }, - { - description: "Some devices are not replicated", - config: &nvidia.DeviceConfig{ - Config: &spec.Config{ - Sharing: spec.Sharing{ - TimeSlicing: spec.ReplicatedResources{ - Resources: []spec.ReplicatedResource{ - { - Name: "resource1", - Replicas: 2, - Rename: "replicated-resource1", - Devices: spec.ReplicatedDevices{ - List: []spec.ReplicatedDeviceRef{"0"}, // 
only replicate index 0 - }, - }, - }, - }, - }, - }, - }, - devices: DeviceMap{ - "resource1": Devices{ - "0": &device0, - "1": &device1, - }, - }, - expectedDeviceMap: DeviceMap{ - "replicated-resource1": Devices{ - "0::0": &Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0::0"}, Index: "0"}, - "0::1": &Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0::1"}, Index: "0"}, - }, - "resource1": Devices{ - "1": &device1, - }, - }, - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - devices, _ := updateDeviceMapWithReplicas(tc.config, tc.devices) - require.EqualValues(t, tc.expectedDeviceMap, devices) - }) - } -} - -func TestDeviceMapMerge(t *testing.T) { - device0 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0"}} - device1 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "1"}} - device2 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "2"}} - device0Updated := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0"}, Index: "updated"} - - testCases := []struct { - description string - deviceMap DeviceMap - otherDeviceMap DeviceMap - expectedDeviceMap DeviceMap - }{ - { - description: "merge into empty map", - deviceMap: make(DeviceMap), - otherDeviceMap: DeviceMap{"resource": Devices{"0": &device0}}, - expectedDeviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - }, - }, - }, - { - description: "merge from empty map", - deviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - }, - }, - otherDeviceMap: make(DeviceMap), - expectedDeviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - }, - }, - }, - { - description: "merge with overlapping keys", - deviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - }, - }, - otherDeviceMap: DeviceMap{ - "resource": Devices{ - "1": &device1, - }, - }, - expectedDeviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - "1": &device1, - }, - }, - }, - { - description: "merge with device ID conflict (overwrite existing 
device)", - deviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - }, - }, - otherDeviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0Updated, - }, - }, - expectedDeviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0Updated, - }, - }, - }, - { - description: "merge with new resource", - deviceMap: DeviceMap{ - "resource1": Devices{ - "0": &device0, - }, - }, - otherDeviceMap: DeviceMap{ - "resource2": Devices{ - "1": &device1, - }, - }, - expectedDeviceMap: DeviceMap{ - "resource1": Devices{ - "0": &device0, - }, - "resource2": Devices{ - "1": &device1, - }, - }, - }, - { - description: "merge with multiple devices and resources", - deviceMap: DeviceMap{ - "resource1": Devices{ - "0": &device0, - }, - }, - otherDeviceMap: DeviceMap{ - "resource1": Devices{ - "1": &device1, - }, - "resource2": Devices{ - "2": &device2, - }, - }, - expectedDeviceMap: DeviceMap{ - "resource1": Devices{ - "0": &device0, - "1": &device1, - }, - "resource2": Devices{ - "2": &device2, - }, - }, - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - tc.deviceMap.merge(tc.otherDeviceMap) - - require.EqualValues(t, tc.expectedDeviceMap, tc.deviceMap) - }) - } -} - -func TestDeviceMapIsEmpty(t *testing.T) { - device0 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0"}} - - testCases := []struct { - description string - deviceMap DeviceMap - expected bool - }{ - { - description: "empty map", - deviceMap: make(DeviceMap), - expected: true, - }, - { - description: "map with empty resource", - deviceMap: DeviceMap{ - "resource": Devices{}, - }, - expected: true, - }, - { - description: "map with non-empty resource", - deviceMap: DeviceMap{ - "resource": Devices{ - "0": &device0, - }, - }, - expected: false, - }, - { - description: "map with multiple empty resources", - deviceMap: DeviceMap{ - "resource1": Devices{}, - "resource2": Devices{}, - }, - expected: true, - }, - { - description: "map with multiple resources, one 
non-empty", - deviceMap: DeviceMap{ - "resource1": Devices{}, - "resource2": Devices{ - "0": &device0, - }, - }, - expected: false, - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - actual := tc.deviceMap.isEmpty() - - require.Equal(t, tc.expected, actual) - }) - } -} - -func TestDeviceMapGetIDsOfDevicesToReplicate(t *testing.T) { - device0 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0"}, Index: "0"} - device1 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "1"}, Index: "1"} - device2 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "2"}, Index: "2"} - device3 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "3"}, Index: "3"} - - deviceMap := DeviceMap{ - "resource1": Devices{ - "0": &device0, - "1": &device1, - "2": &device2, - "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76": &device3, - }, - } - - testCases := []struct { - description string - deviceMap DeviceMap - resource *spec.ReplicatedResource - expectedIDs []string - expectedErr error - }{ - { - description: "resource does not exist", - deviceMap: deviceMap, - resource: &spec.ReplicatedResource{ - Name: "nonexistent_resource", - Devices: spec.ReplicatedDevices{}, - }, - expectedIDs: nil, - expectedErr: nil, - }, - { - description: "replicate all devices", - deviceMap: deviceMap, - resource: &spec.ReplicatedResource{ - Name: "resource1", - Devices: spec.ReplicatedDevices{ - All: true, - }, - }, - expectedIDs: []string{"0", "1", "2", "3"}, - expectedErr: nil, - }, - { - description: "replicate specific count of devices (count exceeds available)", - deviceMap: deviceMap, - resource: &spec.ReplicatedResource{ - Name: "resource1", - Devices: spec.ReplicatedDevices{ - Count: 5, - }, - }, - expectedIDs: nil, - expectedErr: fmt.Errorf("requested 5 devices to be replicated, but only 4 devices available"), - }, - { - description: "replicate specific devices by ID (valid)", - deviceMap: deviceMap, - resource: &spec.ReplicatedResource{ - Name: 
"resource1", - Devices: spec.ReplicatedDevices{ - List: []spec.ReplicatedDeviceRef{ - spec.ReplicatedDeviceRef("GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76"), // ref UUID - }, - }, - }, - expectedIDs: []string{"3"}, - expectedErr: nil, - }, - { - description: "replicate specific devices by ID (invalid ID)", - deviceMap: deviceMap, - resource: &spec.ReplicatedResource{ - Name: "resource1", - Devices: spec.ReplicatedDevices{ - List: []spec.ReplicatedDeviceRef{ - spec.ReplicatedDeviceRef("GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b77"), // Nonexistent device - }, - }, - }, - expectedIDs: nil, - expectedErr: fmt.Errorf("no matching device with UUID: GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b77"), - }, - { - description: "replicate specific devices by GPU index (valid)", - deviceMap: deviceMap, - resource: &spec.ReplicatedResource{ - Name: "resource1", - Devices: spec.ReplicatedDevices{ - List: []spec.ReplicatedDeviceRef{ - spec.ReplicatedDeviceRef("0"), // Index: "0" - spec.ReplicatedDeviceRef("1"), // Index: "1" - }, - }, - }, - expectedIDs: []string{"0", "1"}, - expectedErr: nil, - }, - { - description: "replicate specific devices by GPU index (invalid)", - deviceMap: deviceMap, - resource: &spec.ReplicatedResource{ - Name: "resource1", - Devices: spec.ReplicatedDevices{ - List: []spec.ReplicatedDeviceRef{ - spec.ReplicatedDeviceRef("0"), // Index: "0" - spec.ReplicatedDeviceRef("4"), // Nonexistent Index - }, - }, - }, - expectedIDs: nil, - expectedErr: fmt.Errorf("no matching device at index: 4"), - }, - { - description: "invalid replicated devices", - deviceMap: deviceMap, - resource: &spec.ReplicatedResource{ - Name: "resource1", - Devices: spec.ReplicatedDevices{ - List: []spec.ReplicatedDeviceRef{ - spec.ReplicatedDeviceRef("invalid_index"), // Invalid gpu - }, - }, - }, - expectedIDs: nil, - expectedErr: nil, - }, - { - description: "unexpected error (no replication criteria provided)", - deviceMap: deviceMap, - resource: &spec.ReplicatedResource{ - Name: "resource1", - 
Devices: spec.ReplicatedDevices{}, - }, - expectedIDs: nil, - expectedErr: fmt.Errorf("unexpected error"), - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - ids, err := tc.deviceMap.getIDsOfDevicesToReplicate(tc.resource) - - if tc.expectedErr != nil { - require.Error(t, err) - require.EqualError(t, err, tc.expectedErr.Error()) - } else { - require.NoError(t, err) - } - - require.ElementsMatch(t, tc.expectedIDs, ids) - }) - } -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/health_test.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/health_test.go deleted file mode 100644 index 5818839f9..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/health_test.go +++ /dev/null @@ -1,100 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. 
- */ - -package rm - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestGetAdditionalXids(t *testing.T) { - testCases := []struct { - description string - input string - expected []uint64 - }{ - { - description: "Empty input", - }, - { - description: "Only comma", - input: ",", - }, - { - description: "Non-integer input", - input: "not-an-int", - }, - { - description: "Single integer", - input: "68", - expected: []uint64{68}, - }, - { - description: "Negative integer", - input: "-68", - }, - { - description: "Single integer with trailing spaces", - input: "68 ", - expected: []uint64{68}, - }, - { - description: "Single integer followed by comma without trailing number", - input: "68,", - expected: []uint64{68}, - }, - { - description: "Comma without preceding number followed by single integer", - input: ",68", - expected: []uint64{68}, - }, - { - description: "Two comma-separated integers", - input: "68,67", - expected: []uint64{68, 67}, - }, - { - description: "Two integers separated by non-integer", - input: "68,not-an-int,67", - expected: []uint64{68, 67}, - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - xids := getAdditionalXids(tc.input) - require.EqualValues(t, tc.expected, xids) - }) - } -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/helper.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/helper.go deleted file mode 100644 index 4c3ff3c65..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/helper.go +++ /dev/null @@ -1,54 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. 
NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package rm - -// int8Slice wraps an []int8 with more functions. -type int8Slice []int8 - -// String turns a nil terminated int8Slice into a string -func (s int8Slice) String() string { - var b []byte - for _, c := range s { - if c == 0 { - break - } - b = append(b, byte(c)) - } - return string(b) -} - -// uintPtr returns a *uint from a uint32 -func uintPtr(c uint32) *uint { - i := uint(c) - return &i -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_devices_test.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_devices_test.go deleted file mode 100644 index 921af2f6d..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_devices_test.go +++ /dev/null @@ -1,179 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package rm - -import ( - "fmt" - "testing" - - "github.com/NVIDIA/go-nvlib/pkg/nvml" - //"github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/rm" - "github.com/stretchr/testify/require" -) - -// Test GetUUID for nvmlDevice -func TestNvmlDevice_GetUUID(t *testing.T) { - testCases := []struct { - description string - nvmlDevice nvml.Device - expectedUUID string - expectedError error - }{ - { - description: "Successful UUID retrieval", - nvmlDevice: &nvml.DeviceMock{ - GetUUIDFunc: func() (string, nvml.Return) { - return "GPU-12345", nvml.SUCCESS - }, - }, - expectedUUID: "GPU-12345", - expectedError: nil, - }, - { - description: "Error retrieving UUID", - nvmlDevice: &nvml.DeviceMock{ - GetUUIDFunc: func() (string, nvml.Return) { - return "GPU-12345", nvml.ERROR_UNKNOWN - }, - }, - expectedUUID: "", - expectedError: nvml.ERROR_UNKNOWN, - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - device := nvmlDevice{Device: tc.nvmlDevice} - uuid, err := device.GetUUID() - - if tc.expectedError == nil { - require.NoError(t, err) - } else { - require.EqualError(t, err, tc.expectedError.Error()) - } - require.Equal(t, tc.expectedUUID, uuid) - }) - } -} - -func TestNvmlDevice_GetPaths(t *testing.T) { - testCases := []struct { - description string - nvmlDevice nvml.Device - expectedPaths []string - expectedError error - }{ - { - description: "Successful path retrieval", - nvmlDevice: &nvml.DeviceMock{ - 
GetMinorNumberFunc: func() (int, nvml.Return) { - return 0, nvml.SUCCESS - }, - }, - expectedPaths: []string{"/dev/nvidia0"}, - expectedError: nil, - }, - { - description: "Error retrieving UUID", - nvmlDevice: &nvml.DeviceMock{ - GetMinorNumberFunc: func() (int, nvml.Return) { - return 0, nvml.ERROR_UNKNOWN - }, - }, - expectedPaths: nil, - expectedError: fmt.Errorf("error getting GPU device minor number: %v", nvml.ERROR_UNKNOWN), - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - device := nvmlDevice{Device: tc.nvmlDevice} - paths, err := device.GetPaths() - - if tc.expectedError == nil { - require.NoError(t, err) - } else { - require.Contains(t, err.Error(), nvml.ERROR_UNKNOWN.Error()) - } - require.Equal(t, tc.expectedPaths, paths) - }) - } -} - -func TestNvmlDevice_GetNumaNode(t *testing.T) { - testCases := []struct { - description string - nvmlDevice nvml.Device - expectedHasNode bool - expectedNode int - expectedError error - }{ - { - description: "No NUMA node", - nvmlDevice: &nvml.DeviceMock{ - GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) { - return nvml.PciInfo{BusId: [32]int8{'0', '0', '0', '0', ':', '0', '2', ':', '0', '0', '.', '0', 0, 0, 0, 0}}, nvml.SUCCESS - }, - }, - expectedHasNode: false, - expectedNode: 0, - expectedError: nil, - }, - { - description: "Error getting PCI info", - nvmlDevice: &nvml.DeviceMock{ - GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) { - return nvml.PciInfo{}, nvml.ERROR_UNKNOWN - }, - }, - expectedHasNode: false, - expectedNode: 0, - expectedError: nvml.ERROR_UNKNOWN, - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - device := nvmlDevice{Device: tc.nvmlDevice} - hasNode, node, err := device.GetNumaNode() - - if tc.expectedError == nil { - require.NoError(t, err) - } else { - require.Contains(t, err.Error(), tc.expectedError.Error()) - } - require.Equal(t, tc.expectedHasNode, hasNode) - require.Equal(t, tc.expectedNode, node) - }) - 
} -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go deleted file mode 100644 index ee5048348..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go +++ /dev/null @@ -1,120 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package rm - -import ( - "fmt" - - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" - - "github.com/NVIDIA/go-nvlib/pkg/nvml" - "k8s.io/klog/v2" -) - -type nvmlResourceManager struct { - resourceManager - nvml nvml.Interface -} - -var _ ResourceManager = (*nvmlResourceManager)(nil) - -// NewNVMLResourceManagers returns a set of ResourceManagers, one for each NVML resource in 'config'. 
-func NewNVMLResourceManagers(nvmllib nvml.Interface, config *nvidia.DeviceConfig) ([]ResourceManager, error) { - ret := nvmllib.Init() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("failed to initialize NVML: %v", ret) - } - defer func() { - ret := nvmllib.Shutdown() - if ret != nvml.SUCCESS { - klog.Infof("Error shutting down NVML: %v", ret) - } - }() - - deviceMap, err := NewDeviceMap(nvmllib, config) - if err != nil { - return nil, fmt.Errorf("error building device map: %v", err) - } - - var rms []ResourceManager - for resourceName, devices := range deviceMap { - if len(devices) == 0 { - continue - } - for key, value := range devices { - if nvidia.FilterDeviceToRegister(value.ID, value.Index) { - klog.V(5).InfoS("Filtering device", "device", value.ID) - delete(devices, key) - continue - } - } - r := &nvmlResourceManager{ - resourceManager: resourceManager{ - config: config, - resource: resourceName, - devices: devices, - }, - nvml: nvmllib, - } - rms = append(rms, r) - } - - return rms, nil -} - -// GetPreferredAllocation runs an allocation algorithm over the inputs. -// The algorithm chosen is based both on the incoming set of available devices and various config settings. 
-func (r *nvmlResourceManager) GetPreferredAllocation(available, required []string, size int) ([]string, error) { - return r.getPreferredAllocation(available, required, size) -} - -// GetDevicePaths returns the required and optional device nodes for the requested resources -func (r *nvmlResourceManager) GetDevicePaths(ids []string) []string { - paths := []string{ - "/dev/nvidiactl", - "/dev/nvidia-uvm", - "/dev/nvidia-uvm-tools", - "/dev/nvidia-modeset", - } - - for _, p := range r.Devices().Subset(ids).GetPaths() { - paths = append(paths, p) - } - - return paths -} - -// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices -func (r *nvmlResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error { - return r.checkHealth(stop, r.devices, unhealthy) -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/rm.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/rm.go deleted file mode 100644 index 0c30bbf54..000000000 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/rm.go +++ /dev/null @@ -1,176 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package rm - -import ( - "fmt" - "strings" - - "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" - "github.com/NVIDIA/go-nvlib/pkg/nvml" - spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" - "k8s.io/klog/v2" -) - -// resourceManager forms the base type for specific resource manager implementations -type resourceManager struct { - config *nvidia.DeviceConfig - resource spec.ResourceName - devices Devices -} - -// ResourceManager provides an interface for listing a set of Devices and checking health on them -type ResourceManager interface { - Resource() spec.ResourceName - Devices() Devices - GetDevicePaths([]string) []string - GetPreferredAllocation(available, required []string, size int) ([]string, error) - CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error -} - -// NewResourceManagers returns a []ResourceManager, one for each resource in 'config'. 
-func NewResourceManagers(nvmllib nvml.Interface, config *nvidia.DeviceConfig) ([]ResourceManager, error) { - // logWithReason logs the output of the has* / is* checks from the info.Interface - logWithReason := func(f func() (bool, string), tag string) bool { - is, reason := f() - if !is { - tag = "non-" + tag - } - klog.Infof("Detected %v platform: %v", tag, reason) - return is - } - - infolib := info.New() - - hasNVML := logWithReason(infolib.HasNvml, "NVML") - isTegra := logWithReason(infolib.IsTegraSystem, "Tegra") - - if !hasNVML && !isTegra { - klog.Error("Incompatible platform detected") - klog.Error("If this is a GPU node, did you configure the NVIDIA Container Toolkit?") - klog.Error("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") - klog.Error("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") - klog.Error("If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes") - if *config.Flags.FailOnInitError { - return nil, fmt.Errorf("platform detection failed") - } - return nil, nil - } - - // The NVIDIA container stack does not yet support the use of integrated AND discrete GPUs on the same node. - if hasNVML && isTegra { - klog.Warning("Disabling Tegra-based resources on NVML system") - isTegra = false - } - - var resourceManagers []ResourceManager - - if hasNVML { - nvmlManagers, err := NewNVMLResourceManagers(nvmllib, config) - if err != nil { - return nil, fmt.Errorf("failed to construct NVML resource managers: %v", err) - } - resourceManagers = append(resourceManagers, nvmlManagers...) - } - - if isTegra { - tegraManagers, err := NewTegraResourceManagers(config) - if err != nil { - return nil, fmt.Errorf("failed to construct Tegra resource managers: %v", err) - } - resourceManagers = append(resourceManagers, tegraManagers...) 
- } - - return resourceManagers, nil -} - -// Resource gets the resource name associated with the ResourceManager -func (r *resourceManager) Resource() spec.ResourceName { - return r.resource -} - -// Resource gets the devices managed by the ResourceManager -func (r *resourceManager) Devices() Devices { - return r.devices -} - -// AddDefaultResourcesToConfig adds default resource matching rules to config.Resources -func AddDefaultResourcesToConfig(config *nvidia.DeviceConfig) error { - //config.Resources.AddGPUResource("*", "gpu") - config.Resources.GPUs = append(config.Resources.GPUs, spec.Resource{ - Pattern: "*", - Name: spec.ResourceName(*config.ResourceName), - }) - fmt.Println("config=", config.Resources.GPUs) - switch *config.Flags.MigStrategy { - case spec.MigStrategySingle: - return config.Resources.AddMIGResource("*", "gpu") - case spec.MigStrategyMixed: - hasNVML, reason := info.New().HasNvml() - if !hasNVML { - klog.Warningf("mig-strategy=%q is only supported with NVML", spec.MigStrategyMixed) - klog.Warningf("NVML not detected: %v", reason) - return nil - } - - nvmllib := nvml.New() - ret := nvmllib.Init() - if ret != nvml.SUCCESS { - if *config.Flags.FailOnInitError { - return fmt.Errorf("failed to initialize NVML: %v", ret) - } - return nil - } - defer func() { - ret := nvmllib.Shutdown() - if ret != nvml.SUCCESS { - klog.Errorf("Error shutting down NVML: %v", ret) - } - }() - - devicelib := device.New( - device.WithNvml(nvmllib), - ) - return devicelib.VisitMigProfiles(func(p device.MigProfile) error { - profileInfo := p.GetInfo() - if profileInfo.C != profileInfo.G { - return nil - } - resourceName := strings.ReplaceAll("mig-"+p.String(), "+", ".") - return config.Resources.AddMIGResource(p.String(), resourceName) - }) - } - return nil -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/wsl_devices.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/wsl_devices.go deleted file mode 100644 index 7696a3038..000000000 --- 
a/pkg/device-plugin/nvidiadevice/nvinternal/rm/wsl_devices.go +++ /dev/null @@ -1,52 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ - -package rm - -type wslDevice nvmlDevice - -var _ deviceInfo = (*wslDevice)(nil) - -// GetUUID returns the UUID of the device -func (d wslDevice) GetUUID() (string, error) { - return nvmlDevice(d).GetUUID() -} - -// GetPaths returns the paths for a tegra device. 
-func (d wslDevice) GetPaths() ([]string, error) { - return []string{"/dev/dxg"}, nil -} - -// GetNumaNode returns the NUMA node associated with the GPU device -func (d wslDevice) GetNumaNode() (bool, int, error) { - return nvmlDevice(d).GetNumaNode() -} diff --git a/pkg/device/nvidia/device.go b/pkg/device/nvidia/device.go index 2d962e9b3..664399689 100644 --- a/pkg/device/nvidia/device.go +++ b/pkg/device/nvidia/device.go @@ -23,13 +23,13 @@ import ( "strconv" "strings" - "github.com/Project-HAMi/HAMi/pkg/util" - "github.com/Project-HAMi/HAMi/pkg/util/nodelock" - - spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/klog/v2" + + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/util" + "github.com/Project-HAMi/HAMi/pkg/util/nodelock" ) const ( diff --git a/pkg/nvidia-plugin/api/config/v1/config.go b/pkg/nvidia-plugin/api/config/v1/config.go new file mode 100644 index 000000000..5672340bc --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/config.go @@ -0,0 +1,160 @@ +/* +* SPDX-License-Identifier: Apache-2.0 +* +* The HAMi Contributors require contributions made to +* this file be licensed under the Apache-2.0 license or a +* compatible open source license. + */ + +/* +* Licensed to NVIDIA CORPORATION under one or more contributor +* license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright +* ownership. NVIDIA CORPORATION licenses this file to you under +* the Apache License, Version 2.0 (the "License"); you may +* not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +*     http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied.  See the License for the +* specific language governing permissions and limitations +* under the License. + */ + +/* +* Modifications Copyright The HAMi Authors. See +* GitHub history for details. + */ + +package v1 + +import ( + "fmt" + "io" + "os" + + cli "github.com/urfave/cli/v2" + + "sigs.k8s.io/yaml" +) + +// Version indicates the version of the 'Config' struct used to hold configuration information. +const Version = "v1" + +// Config is a versioned struct used to hold configuration information. +type Config struct { + Version string `json:"version" yaml:"version"` + Flags Flags `json:"flags,omitempty" yaml:"flags,omitempty"` + Resources Resources `json:"resources,omitempty" yaml:"resources,omitempty"` + Sharing Sharing `json:"sharing,omitempty" yaml:"sharing,omitempty"` + Imex Imex `json:"imex,omitempty" yaml:"imex,omitempty"` +} + +// NewConfig builds out a Config struct from a config file (or command line flags). +// The data stored in the config will be populated in order of precedence from +// (1) command line, (2) environment variable, (3) config file. +func NewConfig(c *cli.Context, flags []cli.Flag) (*Config, error) { + config := &Config{Version: Version} + + if configFile := c.String("config-file"); configFile != "" { + var err error + config, err = parseConfig(configFile) + if err != nil { + return nil, fmt.Errorf("unable to parse config file: %v", err) + } + } + + config.Flags.UpdateFromCLIFlags(c, flags) + // TODO: This is currently not at the flags level? + // Does this mean that we should move UpdateFromCLIFlags to function off Config? 
+ if c.IsSet("imex-channel-ids") { + config.Imex.ChannelIDs = c.IntSlice("imex-channel-ids") + } + if c.IsSet("imex-required") { + config.Imex.Required = c.Bool("imex-required") + } + + // If nvidiaDevRoot (the path to the device nodes on the host) is not set, + // we default to using the driver root on the host. + if config.Flags.NvidiaDevRoot == nil || *config.Flags.NvidiaDevRoot == "" { + config.Flags.NvidiaDevRoot = config.Flags.NvidiaDriverRoot + } + + // We explicitly set sharing.mps.failRequestsGreaterThanOne = true + // This can be relaxed in certain cases -- such as a single GPU -- but + // requires additional logic around when it's OK to combine requests and + // makes the semantics of a request unclear. + if config.Sharing.MPS != nil { + config.Sharing.MPS.FailRequestsGreaterThanOne = true + } + + return config, nil +} + +// logger is used to issue warnings in API functions without requiring an explicit implementation. +type logger interface { + Warning(...interface{}) + Warningf(string, ...interface{}) +} + +// DisableResourceNamingInConfig temporarily disables the resource renaming feature of the plugin. +// This may be reenabled in a future release. +func DisableResourceNamingInConfig(logger logger, config *Config) { + // Disable resource renaming through config.Resource + if len(config.Resources.GPUs) > 0 || len(config.Resources.MIGs) > 0 { + logger.Warning("Customizing the 'resources' field is not yet supported in the config. Ignoring...") + } + config.Resources.GPUs = nil + config.Resources.MIGs = nil + + // Disable renaming / device selection in Sharing.TimeSlicing.Resources + config.Sharing.TimeSlicing.disableResoureRenaming(logger, "timeSlicing") + // Disable renaming / device selection in Sharing.MPS.Resources + config.Sharing.MPS.disableResoureRenaming(logger, "mps") +} + +// parseConfig parses a config file as either YAML or JSON and unmarshals it into a Config struct.
+func parseConfig(configFile string) (*Config, error) { + reader, err := os.Open(configFile) + if err != nil { + return nil, fmt.Errorf("error opening config file: %v", err) + } + defer reader.Close() + + config, err := parseConfigFrom(reader) + if err != nil { + return nil, fmt.Errorf("error parsing config file: %v", err) + } + + return config, nil +} + +func parseConfigFrom(reader io.Reader) (*Config, error) { + var err error + var configYaml []byte + + configYaml, err = io.ReadAll(reader) + if err != nil { + return nil, fmt.Errorf("read error: %v", err) + } + + var config Config + err = yaml.Unmarshal(configYaml, &config) + if err != nil { + return nil, fmt.Errorf("unmarshal error: %v", err) + } + + if config.Version == "" { + config.Version = Version + } + + if config.Version != Version { + return nil, fmt.Errorf("unknown version: %v", config.Version) + } + + return &config, nil +} diff --git a/pkg/nvidia-plugin/api/config/v1/consts.go b/pkg/nvidia-plugin/api/config/v1/consts.go new file mode 100644 index 000000000..eaafe7258 --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/consts.go @@ -0,0 +1,72 @@ +/* +* SPDX-License-Identifier: Apache-2.0 +* +* The HAMi Contributors require contributions made to +* this file be licensed under the Apache-2.0 license or a +* compatible open source license. + */ + +/* +* Licensed to NVIDIA CORPORATION under one or more contributor +* license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright +* ownership. NVIDIA CORPORATION licenses this file to you under +* the Apache License, Version 2.0 (the "License"); you may +* not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +*     http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied.  See the License for the +* specific language governing permissions and limitations +* under the License. + */ + +/* +* Modifications Copyright The HAMi Authors. See +* GitHub history for details. + */ + +package v1 + +import ( + cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" +) + +// Constants related to resource names +const ( + ResourceNamePrefix = "nvidia.com" + DefaultSharedResourceNameSuffix = ".shared" + MaxResourceNameLength = 63 +) + +// Constants representing the various MIG strategies +const ( + MigStrategyNone = "none" + MigStrategySingle = "single" + MigStrategyMixed = "mixed" +) + +// Constants to represent the various device list strategies +const ( + DeviceListStrategyEnvVar = "envvar" + DeviceListStrategyVolumeMounts = "volume-mounts" + DeviceListStrategyCDIAnnotations = "cdi-annotations" + DeviceListStrategyCDICRI = "cdi-cri" +) + +// Constants to represent the various device id strategies +const ( + DeviceIDStrategyUUID = "uuid" + DeviceIDStrategyIndex = "index" +) + +// Constants related to generating CDI specifications +const ( + DefaultCDIAnnotationPrefix = cdiapi.AnnotationPrefix + DefaultNvidiaCTKPath = "/usr/bin/nvidia-ctk" + DefaultContainerDriverRoot = "/driver-root" +) diff --git a/pkg/nvidia-plugin/api/config/v1/duration.go b/pkg/nvidia-plugin/api/config/v1/duration.go new file mode 100644 index 000000000..a3bd7f118 --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/duration.go @@ -0,0 +1,69 @@ +/* +* SPDX-License-Identifier: Apache-2.0 +* +* The HAMi Contributors require contributions made to +* this file be licensed under the Apache-2.0 license or a +* compatible open source license. 
+ */ + +/* +* Licensed to NVIDIA CORPORATION under one or more contributor +* license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright +* ownership. NVIDIA CORPORATION licenses this file to you under +* the Apache License, Version 2.0 (the "License"); you may +* not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +*     http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied.  See the License for the +* specific language governing permissions and limitations +* under the License. + */ + +/* +* Modifications Copyright The HAMi Authors. See +* GitHub history for details. + */ + +package v1 + +import ( + "encoding/json" + "fmt" + "time" +) + +// Duration wraps a time.Duration function with custom JSON marshaling/unmarshaling +type Duration time.Duration + +// MarshalJSON marshals 'Duration' to its raw bytes representation +func (d Duration) MarshalJSON() ([]byte, error) { + return json.Marshal(time.Duration(d).String()) +} + +// UnmarshalJSON unmarshals raw bytes into a 'Duration' type. 
+func (d *Duration) UnmarshalJSON(b []byte) error { + var v interface{} + if err := json.Unmarshal(b, &v); err != nil { + return err + } + switch value := v.(type) { + case float64: + *d = Duration(time.Duration(value)) + return nil + case string: + tmp, err := time.ParseDuration(value) + if err != nil { + return err + } + *d = Duration(tmp) + return nil + default: + return fmt.Errorf("invalid duration") + } +} diff --git a/pkg/nvidia-plugin/api/config/v1/flags.go b/pkg/nvidia-plugin/api/config/v1/flags.go new file mode 100644 index 000000000..d26a96b78 --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/flags.go @@ -0,0 +1,190 @@ +/* +* SPDX-License-Identifier: Apache-2.0 +* +* The HAMi Contributors require contributions made to +* this file be licensed under the Apache-2.0 license or a +* compatible open source license. + */ + +/* +* Licensed to NVIDIA CORPORATION under one or more contributor +* license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright +* ownership. NVIDIA CORPORATION licenses this file to you under +* the Apache License, Version 2.0 (the "License"); you may +* not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +*     http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied.  See the License for the +* specific language governing permissions and limitations +* under the License. + */ + +/* +* Modifications Copyright The HAMi Authors. See +* GitHub history for details. 
+ */ + +package v1 + +import ( + "encoding/json" + "fmt" + + cli "github.com/urfave/cli/v2" +) + +// ptr returns a reference to whatever type is passed into it +func ptr[T any](x T) *T { + return &x +} + +// updateFromCLIFlag conditionally updates the config flag at 'pflag' to the value of the CLI flag with name 'flagName' +func updateFromCLIFlag[T any](pflag **T, c *cli.Context, flagName string) { + if c.IsSet(flagName) || *pflag == (*T)(nil) { + switch flag := any(pflag).(type) { + case **string: + *flag = ptr(c.String(flagName)) + case **[]string: + *flag = ptr(c.StringSlice(flagName)) + case **bool: + *flag = ptr(c.Bool(flagName)) + case **Duration: + *flag = ptr(Duration(c.Duration(flagName))) + case **deviceListStrategyFlag: + *flag = ptr((deviceListStrategyFlag)(c.StringSlice(flagName))) + default: + panic(fmt.Errorf("unsupported flag type for %v: %T", flagName, flag)) + } + } +} + +// Flags holds the full list of flags used to configure the device plugin and GFD. +type Flags struct { + CommandLineFlags +} + +// CommandLineFlags holds the list of command line flags used to configure the device plugin and GFD.
//
// Every field is a pointer: updateFromCLIFlag treats a nil pointer as "no
// value yet" and fills it from the CLI flag even when that flag was not
// explicitly set on the command line.
type CommandLineFlags struct {
	// MigStrategy is populated from the "mig-strategy" CLI flag.
	MigStrategy *string `json:"migStrategy" yaml:"migStrategy"`
	// FailOnInitError is populated from the "fail-on-init-error" CLI flag.
	FailOnInitError *bool `json:"failOnInitError" yaml:"failOnInitError"`
	// MpsRoot is populated from the "mps-root" CLI flag.
	MpsRoot *string `json:"mpsRoot,omitempty" yaml:"mpsRoot,omitempty"`
	// NvidiaDriverRoot is populated from the "driver-root" / "nvidia-driver-root" CLI flag names.
	NvidiaDriverRoot *string `json:"nvidiaDriverRoot,omitempty" yaml:"nvidiaDriverRoot,omitempty"`
	// NvidiaDevRoot is populated from the "dev-root" / "nvidia-dev-root" CLI flag names.
	NvidiaDevRoot *string `json:"nvidiaDevRoot,omitempty" yaml:"nvidiaDevRoot,omitempty"`
	// GDSEnabled is populated from the "gds-enabled" CLI flag.
	GDSEnabled *bool `json:"gdsEnabled" yaml:"gdsEnabled"`
	// MOFEDEnabled is populated from the "mofed-enabled" CLI flag.
	MOFEDEnabled *bool `json:"mofedEnabled" yaml:"mofedEnabled"`
	// UseNodeFeatureAPI is populated from the "use-node-feature-api" CLI flag.
	UseNodeFeatureAPI *bool `json:"useNodeFeatureAPI" yaml:"useNodeFeatureAPI"`
	// DeviceDiscoveryStrategy is populated from the "device-discovery-strategy" CLI flag.
	DeviceDiscoveryStrategy *string `json:"deviceDiscoveryStrategy" yaml:"deviceDiscoveryStrategy"`
	// Plugin groups the device-plugin specific flags. It is omitted from the
	// serialized form when nil; UpdateFromCLIFlags allocates it when nil.
	Plugin *PluginCommandLineFlags `json:"plugin,omitempty" yaml:"plugin,omitempty"`
	// GFD groups the GFD specific flags. It is omitted from the serialized
	// form when nil; UpdateFromCLIFlags allocates it when nil.
	GFD *GFDCommandLineFlags `json:"gfd,omitempty" yaml:"gfd,omitempty"`
}

// PluginCommandLineFlags holds the list of command line flags specific to the device plugin.
type PluginCommandLineFlags struct {
	// PassDeviceSpecs is populated from the "pass-device-specs" CLI flag.
	PassDeviceSpecs *bool `json:"passDeviceSpecs" yaml:"passDeviceSpecs"`
	// DeviceListStrategy is populated from the "device-list-strategy" CLI flag
	// (read as a string slice).
	DeviceListStrategy *deviceListStrategyFlag `json:"deviceListStrategy" yaml:"deviceListStrategy"`
	// DeviceIDStrategy is populated from the "device-id-strategy" CLI flag.
	DeviceIDStrategy *string `json:"deviceIDStrategy" yaml:"deviceIDStrategy"`
	// CDIAnnotationPrefix is populated from the "cdi-annotation-prefix" CLI flag.
	CDIAnnotationPrefix *string `json:"cdiAnnotationPrefix" yaml:"cdiAnnotationPrefix"`
	// NvidiaCTKPath is populated from the "nvidia-cdi-hook-path" / "nvidia-ctk-path" CLI flag names.
	NvidiaCTKPath *string `json:"nvidiaCTKPath" yaml:"nvidiaCTKPath"`
	// ContainerDriverRoot is populated from the "container-driver-root" CLI flag.
	ContainerDriverRoot *string `json:"containerDriverRoot" yaml:"containerDriverRoot"`
}

// deviceListStrategyFlag is a custom type for parsing the deviceListStrategy flag.
type deviceListStrategyFlag []string

// UnmarshalJSON implements the custom unmarshaler for the deviceListStrategyFlag type.
// Since this option allows a single string or a list of strings to be specified,
// we need to handle both cases.
+func (f *deviceListStrategyFlag) UnmarshalJSON(b []byte) error { + var single string + err := json.Unmarshal(b, &single) + if err == nil { + *f = []string{single} + return nil + } + + var multi []string + if err := json.Unmarshal(b, &multi); err == nil { + *f = multi + return nil + } + + return fmt.Errorf("invalid deviceListStrategy: %v", string(b)) +} + +// GFDCommandLineFlags holds the list of command line flags specific to GFD. +type GFDCommandLineFlags struct { + Oneshot *bool `json:"oneshot" yaml:"oneshot"` + NoTimestamp *bool `json:"noTimestamp" yaml:"noTimestamp"` + SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"` + OutputFile *string `json:"outputFile" yaml:"outputFile"` + MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"` +} + +// UpdateFromCLIFlags updates Flags from settings in the cli Flags if they are set. +func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) { + for _, flag := range flags { + for _, n := range flag.Names() { + // Common flags + switch n { + case "mig-strategy": + updateFromCLIFlag(&f.MigStrategy, c, n) + case "fail-on-init-error": + updateFromCLIFlag(&f.FailOnInitError, c, n) + case "mps-root": + updateFromCLIFlag(&f.MpsRoot, c, n) + case "driver-root", "nvidia-driver-root": + updateFromCLIFlag(&f.NvidiaDriverRoot, c, n) + case "dev-root", "nvidia-dev-root": + updateFromCLIFlag(&f.NvidiaDevRoot, c, n) + case "gds-enabled": + updateFromCLIFlag(&f.GDSEnabled, c, n) + case "mofed-enabled": + updateFromCLIFlag(&f.MOFEDEnabled, c, n) + case "use-node-feature-api": + updateFromCLIFlag(&f.UseNodeFeatureAPI, c, n) + case "device-discovery-strategy": + updateFromCLIFlag(&f.DeviceDiscoveryStrategy, c, n) + } + // Plugin specific flags + if f.Plugin == nil { + f.Plugin = &PluginCommandLineFlags{} + } + switch n { + case "pass-device-specs": + updateFromCLIFlag(&f.Plugin.PassDeviceSpecs, c, n) + case "device-list-strategy": + updateFromCLIFlag(&f.Plugin.DeviceListStrategy, c, n) + case 
"device-id-strategy": + updateFromCLIFlag(&f.Plugin.DeviceIDStrategy, c, n) + case "cdi-annotation-prefix": + updateFromCLIFlag(&f.Plugin.CDIAnnotationPrefix, c, n) + case "nvidia-cdi-hook-path", "nvidia-ctk-path": + updateFromCLIFlag(&f.Plugin.NvidiaCTKPath, c, n) + case "container-driver-root": + updateFromCLIFlag(&f.Plugin.ContainerDriverRoot, c, n) + } + // GFD specific flags + if f.GFD == nil { + f.GFD = &GFDCommandLineFlags{} + } + switch n { + case "oneshot": + updateFromCLIFlag(&f.GFD.Oneshot, c, n) + case "output-file": + updateFromCLIFlag(&f.GFD.OutputFile, c, n) + case "sleep-interval": + updateFromCLIFlag(&f.GFD.SleepInterval, c, n) + case "no-timestamp": + updateFromCLIFlag(&f.GFD.NoTimestamp, c, n) + case "machine-type-file": + updateFromCLIFlag(&f.GFD.MachineTypeFile, c, n) + } + } + } +} diff --git a/pkg/nvidia-plugin/api/config/v1/flags_test.go b/pkg/nvidia-plugin/api/config/v1/flags_test.go new file mode 100644 index 000000000..8f4ac792c --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/flags_test.go @@ -0,0 +1,246 @@ +/* +* SPDX-License-Identifier: Apache-2.0 +* +* The HAMi Contributors require contributions made to +* this file be licensed under the Apache-2.0 license or a +* compatible open source license. + */ + +/* +* Licensed to NVIDIA CORPORATION under one or more contributor +* license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright +* ownership. NVIDIA CORPORATION licenses this file to you under +* the Apache License, Version 2.0 (the "License"); you may +* not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +*     http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied.  
See the License for the +* specific language governing permissions and limitations +* under the License. + */ + +/* +* Modifications Copyright The HAMi Authors. See +* GitHub history for details. + */ + +package v1 + +import ( + "encoding/json" + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestUnmarshalFlags(t *testing.T) { + testCases := []struct { + input string + output Flags + err bool + }{ + { + input: ``, + err: true, + }, + { + input: `{}`, + output: Flags{}, + }, + { + input: `{ + "gfd": {} + }`, + output: Flags{ + CommandLineFlags{ + GFD: &GFDCommandLineFlags{}, + }, + }, + }, + { + input: `{ + "gfd": { + "sleepInterval": 0 + } + }`, + output: Flags{ + CommandLineFlags{ + GFD: &GFDCommandLineFlags{ + SleepInterval: ptr(Duration(0)), + }, + }, + }, + }, + { + input: `{ + "gfd": { + "sleepInterval": "0s" + } + }`, + output: Flags{ + CommandLineFlags{ + GFD: &GFDCommandLineFlags{ + SleepInterval: ptr(Duration(0)), + }, + }, + }, + }, + { + input: `{ + "gfd": { + "sleepInterval": 5 + } + }`, + output: Flags{ + CommandLineFlags{ + GFD: &GFDCommandLineFlags{ + SleepInterval: ptr(Duration(5)), + }, + }, + }, + }, + { + input: `{ + "gfd": { + "sleepInterval": "5s" + } + }`, + output: Flags{ + CommandLineFlags{ + GFD: &GFDCommandLineFlags{ + SleepInterval: ptr(Duration(5 * time.Second)), + }, + }, + }, + }, + { + input: `{ + "plugin": { + "deviceListStrategy": "envvar" + } + }`, + output: Flags{ + CommandLineFlags{ + Plugin: &PluginCommandLineFlags{ + DeviceListStrategy: &deviceListStrategyFlag{"envvar"}, + }, + }, + }, + }, + { + input: `{ + "plugin": { + "deviceListStrategy": ["envvar", "cdi-annotations"] + } + }`, + output: Flags{ + CommandLineFlags{ + Plugin: &PluginCommandLineFlags{ + DeviceListStrategy: &deviceListStrategyFlag{"envvar", "cdi-annotations"}, + }, + }, + }, + }, + } + + for i, tc := range testCases { + t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { + var output Flags + err := 
json.Unmarshal([]byte(tc.input), &output) + if tc.err { + require.Error(t, err) + return + } + require.NoError(t, err) + require.Equal(t, tc.output, output) + }) + } +} + +func TestMarshalFlags(t *testing.T) { + testCases := []struct { + input Flags + output string + err bool + }{ + { + input: Flags{}, + output: `{ + "migStrategy": null, + "failOnInitError": null, + "gdsEnabled": null, + "mofedEnabled": null, + "useNodeFeatureAPI": null, + "deviceDiscoveryStrategy": null + }`, + }, + { + input: Flags{ + CommandLineFlags{ + GFD: &GFDCommandLineFlags{ + SleepInterval: ptr(Duration(0)), + }, + }, + }, + output: `{ + "migStrategy": null, + "failOnInitError": null, + "gdsEnabled": null, + "mofedEnabled": null, + "useNodeFeatureAPI": null, + "deviceDiscoveryStrategy": null, + "gfd": { + "oneshot": null, + "noTimestamp": null, + "outputFile": null, + "sleepInterval": "0s", + "machineTypeFile": null + } + }`, + }, + { + input: Flags{ + CommandLineFlags{ + GFD: &GFDCommandLineFlags{ + SleepInterval: ptr(Duration(5)), + }, + }, + }, + output: `{ + "migStrategy": null, + "failOnInitError": null, + "gdsEnabled": null, + "mofedEnabled": null, + "useNodeFeatureAPI": null, + "deviceDiscoveryStrategy": null, + "gfd": { + "oneshot": null, + "noTimestamp": null, + "outputFile": null, + "sleepInterval": "5ns", + "machineTypeFile": null + } + }`, + }, + } + + for i, tc := range testCases { + t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { + output, err := json.Marshal(tc.input) + if tc.err { + require.Error(t, err) + return + } + require.NoError(t, err) + require.JSONEq(t, tc.output, string(output)) + }) + } +} diff --git a/pkg/nvidia-plugin/api/config/v1/imex.go b/pkg/nvidia-plugin/api/config/v1/imex.go new file mode 100644 index 000000000..928e13e85 --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/imex.go @@ -0,0 +1,53 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except 
in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package v1 + +import ( + "errors" + "fmt" +) + +const ( + ImexChannelEnvVar = "NVIDIA_IMEX_CHANNELS" +) + +var errInvalidImexConfig = errors.New("invalid IMEX config") + +// Imex stores the configuration options for fabric-attached devices. +type Imex struct { + // ChannelIDs defines a list of channel IDs to inject into containers that request NVIDIA devices. + // If a channel ID is specified and the associated channel device node exists, the corresponding + // channel will be added to the ContainerAllocateResponse for containers with access to NVIDIA + // devices. + ChannelIDs []int `json:"channelIDs,omitempty" yaml:"channelIDs,omitempty"` + // Required specifies whether the requested IMEX channel IDs are required or not. + // If a channel is required, it is expected to exist as the device plugin starts. + // If it is not required its injection is skipped if the device nodes do not exist or if its + // existence cannot be queried. + Required bool `json:"required,omitempty" yaml:"required,omitempty"` +} + +// AssertChannelIDsValid checks whether the specified list of channel IDs is valid. 
+func AssertChannelIDsValid(ids []int) error { + switch { + case len(ids) == 0: + return nil + case len(ids) == 1 && ids[0] == 0: + return nil + } + return fmt.Errorf("%w: channelIDs must be [] or [0]; found %v", errInvalidImexConfig, ids) +} diff --git a/pkg/nvidia-plugin/api/config/v1/imex_test.go b/pkg/nvidia-plugin/api/config/v1/imex_test.go new file mode 100644 index 000000000..7a0c72e3a --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/imex_test.go @@ -0,0 +1,83 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package v1 + +import ( + "encoding/json" + "errors" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestImexUnmarshal(t *testing.T) { + testCases := []struct { + description string + input string + expected Imex + expectedError error + }{ + { + description: "empty json", + input: "{}", + expected: Imex{}, + }, + { + description: "null channel ID is valid", + input: `{"channelIDs": null}`, + expected: Imex{}, + }, + { + description: "empty channel ID is valid", + input: `{"channelIDs": []}`, + expected: Imex{ + ChannelIDs: []int{}, + }, + }, + { + description: "single 0 channel ID is valid", + input: `{"channelIDs": [0]}`, + expected: Imex{ + ChannelIDs: []int{0}, + }, + }, + { + description: "single 0 channel ID as int is valid", + input: `{"channelIDs": [0]}`, + expected: Imex{ + ChannelIDs: []int{0}, + }, + }, + { + description: "invalid cases", + input: `{"channelIDs": [2]}`, + expected: Imex{ + ChannelIDs: []int{2}, + }, + expectedError: errInvalidImexConfig, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + var output Imex + err := json.Unmarshal([]byte(tc.input), &output) + require.ErrorIs(t, errors.Join(err, AssertChannelIDsValid(output.ChannelIDs)), tc.expectedError) + require.Equal(t, tc.expected, output) + }) + } +} diff --git a/pkg/nvidia-plugin/api/config/v1/replicas.go b/pkg/nvidia-plugin/api/config/v1/replicas.go new file mode 100644 index 000000000..f2448a195 --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/replicas.go @@ -0,0 +1,355 @@ +/* +* SPDX-License-Identifier: Apache-2.0 +* +* The HAMi Contributors require contributions made to +* this file be licensed under the Apache-2.0 license or a +* compatible open source license. + */ + +/* +* Licensed to NVIDIA CORPORATION under one or more contributor +* license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright +* ownership. 
NVIDIA CORPORATION licenses this file to you under +* the Apache License, Version 2.0 (the "License"); you may +* not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +*     http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied.  See the License for the +* specific language governing permissions and limitations +* under the License. + */ + +/* +* Modifications Copyright The HAMi Authors. See +* GitHub history for details. + */ + +package v1 + +import ( + "encoding/json" + "fmt" + "strconv" + "strings" + + "github.com/google/uuid" +) + +// ReplicatedResources defines generic options for replicating devices. +type ReplicatedResources struct { + RenameByDefault bool `json:"renameByDefault,omitempty" yaml:"renameByDefault,omitempty"` + FailRequestsGreaterThanOne bool `json:"failRequestsGreaterThanOne,omitempty" yaml:"failRequestsGreaterThanOne,omitempty"` + Resources []ReplicatedResource `json:"resources,omitempty" yaml:"resources,omitempty"` +} + +func (rrs *ReplicatedResources) disableResoureRenaming(logger logger, id string) { + if rrs == nil { + return + } + renameByDefault := rrs.RenameByDefault + setsNonDefaultRename := false + setsDevices := false + for i, r := range rrs.Resources { + if !renameByDefault && r.Rename != "" { + setsNonDefaultRename = true + rrs.Resources[i].Rename = "" + } + if renameByDefault && r.Rename != r.Name.DefaultSharedRename() { + setsNonDefaultRename = true + rrs.Resources[i].Rename = r.Name.DefaultSharedRename() + } + if !r.Devices.All { + setsDevices = true + rrs.Resources[i].Devices.All = true + rrs.Resources[i].Devices.Count = 0 + rrs.Resources[i].Devices.List = nil + } + } + if setsNonDefaultRename { + logger.Warningf("Setting the 'rename' field in sharing.%s.resources is not 
yet supported in the config. Ignoring...", id) + } + if setsDevices { + logger.Warningf("Customizing the 'devices' field in sharing.%s.resources is not yet supported in the config. Ignoring...", id) + } + +} + +func (rrs *ReplicatedResources) isReplicated() bool { + if rrs == nil { + return false + } + for _, rr := range rrs.Resources { + if rr.Replicas > 1 { + return true + } + } + return false +} + +// ReplicatedResource represents a resource to be replicated. +type ReplicatedResource struct { + Name ResourceName `json:"name" yaml:"name"` + Rename ResourceName `json:"rename,omitempty" yaml:"rename,omitempty"` + Devices ReplicatedDevices `json:"devices" yaml:"devices,flow"` + Replicas int `json:"replicas" yaml:"replicas"` +} + +// ReplicatedDevices encapsulates the set of devices that should be replicated for a given resource. +// This struct should be treated as a 'union' and only one of the fields in this struct should be set at any given time. +type ReplicatedDevices struct { + All bool + Count int + List []ReplicatedDeviceRef +} + +// ReplicatedDeviceRef can either be a full GPU index, a MIG index, or a UUID (full GPU or MIG) +type ReplicatedDeviceRef string + +// IsGPUIndex checks if a ReplicatedDeviceRef is a full GPU index +func (d ReplicatedDeviceRef) IsGPUIndex() bool { + if _, err := strconv.ParseUint(string(d), 10, 0); err != nil { + return false + } + return true +} + +// IsMigIndex checks if a ReplicatedDeviceRef is a MIG index +func (d ReplicatedDeviceRef) IsMigIndex() bool { + split := strings.SplitN(string(d), ":", 2) + if len(split) != 2 { + return false + } + for _, s := range split { + if _, err := strconv.ParseUint(s, 10, 0); err != nil { + return false + } + } + return true +} + +// IsUUID checks if a ReplicatedDeviceRef is a UUID +func (d ReplicatedDeviceRef) IsUUID() bool { + return d.IsGpuUUID() || d.IsMigUUID() +} + +// IsGpuUUID checks if a ReplicatedDeviceRef is a GPU UUID +// A GPU UUID must be of the form 
GPU-b1028956-cfa2-0990-bf4a-5da9abb51763 +func (d ReplicatedDeviceRef) IsGpuUUID() bool { + if !strings.HasPrefix(string(d), "GPU-") { + return false + } + _, err := uuid.Parse(strings.TrimPrefix(string(d), "GPU-")) + return err == nil +} + +// IsMigUUID checks if a ReplicatedDeviceRef is a MIG UUID +// A MIG UUID can be of one of two forms: +// - MIG-b1028956-cfa2-0990-bf4a-5da9abb51763 +// - MIG-GPU-b1028956-cfa2-0990-bf4a-5da9abb51763/3/0 +func (d ReplicatedDeviceRef) IsMigUUID() bool { + if !strings.HasPrefix(string(d), "MIG-") { + return false + } + suffix := strings.TrimPrefix(string(d), "MIG-") + _, err := uuid.Parse(suffix) + if err == nil { + return true + } + split := strings.SplitN(suffix, "/", 3) + if len(split) != 3 { + return false + } + if !ReplicatedDeviceRef(split[0]).IsGpuUUID() { + return false + } + for _, s := range split[1:] { + _, err := strconv.ParseUint(s, 10, 0) + if err != nil { + return false + } + } + return true +} + +// UnmarshalJSON unmarshals raw bytes into a 'ReplicatedResources' struct. 
+func (s *ReplicatedResources) UnmarshalJSON(b []byte) error { + ts := make(map[string]json.RawMessage) + err := json.Unmarshal(b, &ts) + if err != nil { + return err + } + + renameByDefault, exists := ts["renameByDefault"] + if !exists { + renameByDefault = []byte(`false`) + } + + err = json.Unmarshal(renameByDefault, &s.RenameByDefault) + if err != nil { + return err + } + + failRequestsGreaterThanOne, exists := ts["failRequestsGreaterThanOne"] + if !exists { + failRequestsGreaterThanOne = []byte(`false`) + } + + err = json.Unmarshal(failRequestsGreaterThanOne, &s.FailRequestsGreaterThanOne) + if err != nil { + return err + } + + resources, exists := ts["resources"] + if !exists { + return fmt.Errorf("no resources specified") + } + + err = json.Unmarshal(resources, &s.Resources) + if err != nil { + return err + } + + if len(s.Resources) == 0 { + return fmt.Errorf("no resources specified") + } + + for i, r := range s.Resources { + if s.RenameByDefault && r.Rename == "" { + s.Resources[i].Rename = r.Name.DefaultSharedRename() + } + } + + return nil +} + +// UnmarshalJSON unmarshals raw bytes into a 'ReplicatedResource' struct. 
+func (s *ReplicatedResource) UnmarshalJSON(b []byte) error { + rr := make(map[string]json.RawMessage) + err := json.Unmarshal(b, &rr) + if err != nil { + return err + } + + name, exists := rr["name"] + if !exists { + return fmt.Errorf("no resource name specified") + } + + err = json.Unmarshal(name, &s.Name) + if err != nil { + return err + } + + devices, exists := rr["devices"] + if !exists { + devices = []byte(`"all"`) + } + + err = json.Unmarshal(devices, &s.Devices) + if err != nil { + return err + } + + replicas, exists := rr["replicas"] + if !exists { + return fmt.Errorf("no replicas specified") + } + + err = json.Unmarshal(replicas, &s.Replicas) + if err != nil { + return err + } + + if s.Replicas < 2 { + return fmt.Errorf("number of replicas must be >= 2") + } + + rename, exists := rr["rename"] + if !exists { + return nil + } + + err = json.Unmarshal(rename, &s.Rename) + if err != nil { + return err + } + + return nil +} + +// UnmarshalJSON unmarshals raw bytes into a 'ReplicatedDevices' struct. 
+func (s *ReplicatedDevices) UnmarshalJSON(b []byte) error { + // Match the string 'all' + var str string + err := json.Unmarshal(b, &str) + if err == nil { + if str != "all" { + return fmt.Errorf("devices set as '%v' but the only valid string input is 'all'", str) + } + s.All = true + return nil + } + + // Match a count + var count int + err = json.Unmarshal(b, &count) + if err == nil { + if count <= 0 { + return fmt.Errorf("devices set as '%v' but a count of devices must be > 0", count) + } + s.Count = count + return nil + } + + // Match a list + var slice []json.RawMessage + err = json.Unmarshal(b, &slice) + if err == nil { + // For each item in the list check its format and convert it to a string (if necessary) + result := make([]ReplicatedDeviceRef, len(slice)) + for i, s := range slice { + // Match a uint as a GPU index and convert it to a string + var index uint64 + if err = json.Unmarshal(s, &index); err == nil { + result[i] = ReplicatedDeviceRef(strconv.FormatUint(index, 10)) + continue + } + // Match strings as valid entries if they are GPU indices, MIG indices, or UUIDs + var item string + if err = json.Unmarshal(s, &item); err == nil { + rd := ReplicatedDeviceRef(item) + if rd.IsGPUIndex() || rd.IsMigIndex() || rd.IsUUID() { + result[i] = rd + continue + } + } + // Treat any other entries as errors + return fmt.Errorf("unsupported type for device in devices list: %v, %T", item, item) + } + s.List = result + return nil + } + + // No matches found + return fmt.Errorf("unrecognized type for devices spec: %v", string(b)) +} + +// MarshalJSON marshals ReplicatedDevices to its raw bytes representation +func (s *ReplicatedDevices) MarshalJSON() ([]byte, error) { + if s.All { + return json.Marshal("all") + } + if s.Count > 0 { + return json.Marshal(s.Count) + } + if s.List != nil { + return json.Marshal(s.List) + } + return nil, fmt.Errorf("unmarshallable ReplicatedDevices struct: %v", s) +} diff --git a/pkg/nvidia-plugin/api/config/v1/replicas_test.go 
b/pkg/nvidia-plugin/api/config/v1/replicas_test.go new file mode 100644 index 000000000..7392cb385 --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/replicas_test.go @@ -0,0 +1,482 @@ +/* +* SPDX-License-Identifier: Apache-2.0 +* +* The HAMi Contributors require contributions made to +* this file be licensed under the Apache-2.0 license or a +* compatible open source license. + */ + +/* +* Licensed to NVIDIA CORPORATION under one or more contributor +* license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright +* ownership. NVIDIA CORPORATION licenses this file to you under +* the Apache License, Version 2.0 (the "License"); you may +* not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +*     http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied.  See the License for the +* specific language governing permissions and limitations +* under the License. + */ + +/* +* Modifications Copyright The HAMi Authors. See +* GitHub history for details. 
+ */ + +package v1 + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +func NoErrorNewResourceName(n string) ResourceName { + rn, _ := NewResourceName(n) + return rn +} + +func TestReplicatedDeviceRef(t *testing.T) { + testCases := []struct { + input string + expected string + }{ + { + input: "0", + expected: "gpuIndex", + }, + { + input: "0:0", + expected: "migIndex", + }, + { + input: "GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c", + expected: "uuid", + }, + { + input: "MIG-3eb87630-93d5-b2b6-b8ff-9b359caf4ee2", + expected: "uuid", + }, + { + input: "MIG-GPU-662077db-fa3f-0d8f-9502-21ab0ef058a2/10/0", + expected: "uuid", + }, + } + + for i, tc := range testCases { + t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { + switch tc.expected { + case "gpuIndex": + require.True(t, ReplicatedDeviceRef(tc.input).IsGPUIndex()) + require.False(t, ReplicatedDeviceRef(tc.input).IsMigIndex()) + require.False(t, ReplicatedDeviceRef(tc.input).IsUUID()) + case "migIndex": + require.False(t, ReplicatedDeviceRef(tc.input).IsGPUIndex()) + require.True(t, ReplicatedDeviceRef(tc.input).IsMigIndex()) + require.False(t, ReplicatedDeviceRef(tc.input).IsUUID()) + case "uuid": + require.False(t, ReplicatedDeviceRef(tc.input).IsGPUIndex()) + require.False(t, ReplicatedDeviceRef(tc.input).IsMigIndex()) + require.True(t, ReplicatedDeviceRef(tc.input).IsUUID()) + } + }) + } +} + +func TestMarshalReplicatedDevices(t *testing.T) { + testCases := []struct { + input ReplicatedDevices + output string + err bool + }{ + { + input: ReplicatedDevices{}, + err: true, + }, + { + input: ReplicatedDevices{ + All: true, + }, + output: `"all"`, + }, + { + input: ReplicatedDevices{ + Count: 2, + }, + output: `2`, + }, + { + input: ReplicatedDevices{ + List: []ReplicatedDeviceRef{"0", "0:0", "GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c"}, + }, + output: `["0", "0:0", "GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c"]`, + }, + } + + for i, tc := range testCases { + t.Run(fmt.Sprintf("test 
case %d", i), func(t *testing.T) { + output, err := tc.input.MarshalJSON() + if tc.err { + require.Error(t, err) + return + } + require.NoError(t, err) + require.JSONEq(t, tc.output, string(output)) + }) + } +} + +func TestUnmarshalReplicatedDevices(t *testing.T) { + testCases := []struct { + input string + output ReplicatedDevices + err bool + }{ + { + input: ``, + err: true, + }, + { + input: `"not-all"`, + err: true, + }, + { + input: `-2`, + err: true, + }, + { + input: `2.0`, + err: true, + }, + { + input: `[-1]`, + err: true, + }, + { + input: `["-1"]`, + err: true, + }, + { + input: `["invalid-UUID"]`, + err: true, + }, + { + input: `["GPU-UUID"]`, + err: true, + }, + { + input: `["MIG-UUID"]`, + err: true, + }, + { + input: `["MIG-GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c"]`, + err: true, + }, + { + input: `"all"`, + output: ReplicatedDevices{ + All: true, + }, + }, + { + input: `2`, + output: ReplicatedDevices{ + Count: 2, + }, + }, + { + input: `[0]`, + output: ReplicatedDevices{ + List: []ReplicatedDeviceRef{"0"}, + }, + }, + { + input: `["0"]`, + output: ReplicatedDevices{ + List: []ReplicatedDeviceRef{"0"}, + }, + }, + { + input: `["0:0"]`, + output: ReplicatedDevices{ + List: []ReplicatedDeviceRef{"0:0"}, + }, + }, + { + input: `["GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c"]`, + output: ReplicatedDevices{ + List: []ReplicatedDeviceRef{"GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c"}, + }, + }, + { + input: `["MIG-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c"]`, + output: ReplicatedDevices{ + List: []ReplicatedDeviceRef{"MIG-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c"}, + }, + }, + { + input: `["MIG-GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c/0/0"]`, + output: ReplicatedDevices{ + List: []ReplicatedDeviceRef{"MIG-GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c/0/0"}, + }, + }, + { + input: `[0, "0:0", "GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c"]`, + output: ReplicatedDevices{ + List: []ReplicatedDeviceRef{"0", "0:0", "GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c"}, + }, + }, + { + input: 
`["0", "0:0", "GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c"]`, + output: ReplicatedDevices{ + List: []ReplicatedDeviceRef{"0", "0:0", "GPU-4cf8db2d-06c0-7d70-1a51-e59b25b2c16c"}, + }, + }, + } + + for i, tc := range testCases { + t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { + var output ReplicatedDevices + err := output.UnmarshalJSON([]byte(tc.input)) + if tc.err { + require.Error(t, err) + return + } + require.NoError(t, err) + require.Equal(t, tc.output, output) + }) + } +} + +func TestUnmarshalReplicatedResource(t *testing.T) { + testCases := []struct { + input string + output ReplicatedResource + err bool + }{ + { + input: ``, + err: true, + }, + { + input: `{}`, + err: true, + }, + { + input: `{ + "name": "valid", + }`, + err: true, + }, + { + input: `{ + "name": "valid", + "devices": "all", + }`, + err: true, + }, + { + input: `{ + "name": "valid", + "devices": "all", + "rename": "valid-shared", + }`, + err: true, + }, + { + input: `{ + "name": "valid", + "devices": "all", + "replicas": 2 + }`, + output: ReplicatedResource{ + Name: NoErrorNewResourceName("valid"), + Devices: ReplicatedDevices{All: true}, + Replicas: 2, + }, + }, + { + input: `{ + "name": "valid", + "devices": "all", + "replicas": 2, + "rename": "valid-shared" + }`, + output: ReplicatedResource{ + Name: NoErrorNewResourceName("valid"), + Devices: ReplicatedDevices{All: true}, + Replicas: 2, + Rename: NoErrorNewResourceName("valid-shared"), + }, + }, + { + input: `{ + "name": "valid", + "replicas": -1, + }`, + err: true, + }, + { + input: `{ + "name": "valid", + "replicas": 0, + }`, + err: true, + }, + { + input: `{ + "name": "valid", + "replicas": 2 + }`, + output: ReplicatedResource{ + Name: NoErrorNewResourceName("valid"), + Devices: ReplicatedDevices{All: true}, + Replicas: 2, + }, + }, + { + input: `{ + "name": "valid", + "replicas": 2, + "rename": "valid-shared" + }`, + output: ReplicatedResource{ + Name: NoErrorNewResourceName("valid"), + Devices: ReplicatedDevices{All: true}, 
+ Replicas: 2, + Rename: NoErrorNewResourceName("valid-shared"), + }, + }, + { + input: `{ + "name": "$invalid$", + "replicas": 2, + "rename": "valid-shared" + }`, + err: true, + }, + { + input: `{ + "name": "valid", + "replicas": 2, + "rename": "$invalid$" + }`, + err: true, + }, + } + + for i, tc := range testCases { + t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { + var output ReplicatedResource + err := output.UnmarshalJSON([]byte(tc.input)) + if tc.err { + require.Error(t, err) + return + } + require.NoError(t, err) + require.Equal(t, tc.output, output) + }) + } +} + +func TestUnmarshalReplicatedResources(t *testing.T) { + testCases := []struct { + input string + output ReplicatedResources + err bool + }{ + { + input: ``, + err: true, + }, + { + input: `{}`, + err: true, + }, + { + input: `{ + "resources": [] + }`, + err: true, + }, + { + input: `{ + "resources": [ + { + "name": "valid", + "replicas": 2 + } + ] + }`, + output: ReplicatedResources{ + Resources: []ReplicatedResource{ + { + Name: NoErrorNewResourceName("valid"), + Devices: ReplicatedDevices{All: true}, + Replicas: 2, + }, + }, + }, + }, + { + input: `{ + "resources": [ + { + "name": "valid1", + "replicas": 2 + }, + { + "name": "valid2", + "replicas": 2 + } + ] + }`, + output: ReplicatedResources{ + Resources: []ReplicatedResource{ + { + Name: NoErrorNewResourceName("valid1"), + Devices: ReplicatedDevices{All: true}, + Replicas: 2, + }, + { + Name: NoErrorNewResourceName("valid2"), + Devices: ReplicatedDevices{All: true}, + Replicas: 2, + }, + }, + }, + }, + { + input: `{ + "resources": [ + { + "name": "$invalid$", + "replicas": 2 + } + ] + }`, + err: true, + }, + } + + for i, tc := range testCases { + t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { + var output ReplicatedResources + err := output.UnmarshalJSON([]byte(tc.input)) + if tc.err { + require.Error(t, err) + return + } + require.NoError(t, err) + require.Equal(t, tc.output, output) + }) + } +} diff --git 
a/pkg/nvidia-plugin/api/config/v1/resources.go b/pkg/nvidia-plugin/api/config/v1/resources.go new file mode 100644 index 000000000..1edb98afc --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/resources.go @@ -0,0 +1,196 @@ +/* +* SPDX-License-Identifier: Apache-2.0 +* +* The HAMi Contributors require contributions made to +* this file be licensed under the Apache-2.0 license or a +* compatible open source license. + */ + +/* +* Licensed to NVIDIA CORPORATION under one or more contributor +* license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright +* ownership. NVIDIA CORPORATION licenses this file to you under +* the Apache License, Version 2.0 (the "License"); you may +* not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +*     http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied.  See the License for the +* specific language governing permissions and limitations +* under the License. + */ + +/* +* Modifications Copyright The HAMi Authors. See +* GitHub history for details. + */ + +package v1 + +import ( + "encoding/json" + "fmt" + "regexp" + "strings" + + k8s "k8s.io/apimachinery/pkg/api/validation" +) + +// ResourcePattern is used to match a resource name to a specific pattern +type ResourcePattern string + +// ResourceName represents a valid resource name in Kubernetes +type ResourceName string + +// Resource pairs a pattern matcher with a resource name. +type Resource struct { + Pattern ResourcePattern `json:"pattern" yaml:"pattern"` + Name ResourceName `json:"name" yaml:"name"` +} + +// Resources lists full GPUs and MIG devices separately. 
+type Resources struct { + GPUs []Resource `json:"gpus" yaml:"gpus"` + MIGs []Resource `json:"mig,omitempty" yaml:"mig,omitempty"` +} + +// NewResourceName builds a resource name from the standard prefix and a name. +// An error is returned if the format is incorrect. +func NewResourceName(n string) (ResourceName, error) { + if !strings.HasPrefix(n, ResourceNamePrefix+"/") { + n = ResourceNamePrefix + "/" + n + } + + if len(n) > MaxResourceNameLength { + return "", fmt.Errorf("fully-qualified resource name must be %v characters or less: %v", MaxResourceNameLength, n) + } + + _, name := ResourceName(n).Split() + invalid := k8s.NameIsDNSSubdomain(name, false) + if len(invalid) != 0 { + return "", fmt.Errorf("incorrect format for resource name '%v': %v", n, invalid) + } + + return ResourceName(n), nil +} + +// NewResource builds a resource from a name and pattern +func NewResource(pattern, name string) (*Resource, error) { + resourceName, err := NewResourceName(name) + if err != nil { + return nil, fmt.Errorf("invalid resource name: %v", err) + } + r := &Resource{ + Pattern: ResourcePattern(pattern), + Name: resourceName, + } + return r, nil +} + +// Split splits a full resource name into prefix and name +func (r ResourceName) Split() (string, string) { + split := strings.SplitN(string(r), "/", 2) + if len(split) != 2 { + return "", string(r) + } + return split[0], split[1] +} + +// DefaultSharedRename returns the default renaming to apply when this resource is shared +func (r ResourceName) DefaultSharedRename() ResourceName { + return r + DefaultSharedResourceNameSuffix +} + +// UnmarshalJSON unmarshals raw bytes into a 'Resource' struct. 
+func (r *Resource) UnmarshalJSON(b []byte) error { + res := make(map[string]json.RawMessage) + err := json.Unmarshal(b, &res) + if err != nil { + return err + } + + // Verify both fields set in the resource JSON + pattern, patternExists := res["pattern"] + name, nameExists := res["name"] + if !patternExists { + return fmt.Errorf("resources must have a 'pattern' field set") + } + if !nameExists { + return fmt.Errorf("resources must have a 'name' field set") + } + + // Set r.Pattern from the resource JSON + err = json.Unmarshal(pattern, &r.Pattern) + if err != nil { + return err + } + + // Set r.Name from the resource JSON + err = json.Unmarshal(name, &r.Name) + if err != nil { + return err + } + + return nil +} + +// UnmarshalJSON unmarshals raw bytes into a 'ResourceName' type. +func (r *ResourceName) UnmarshalJSON(b []byte) error { + var raw string + err := json.Unmarshal(b, &raw) + if err != nil { + return err + } + + *r, err = NewResourceName(raw) + if err != nil { + return err + } + + return nil +} + +// AddGPUResource adds a GPU resource to the list of GPU resources. +func (r *Resources) AddGPUResource(pattern, name string) error { + resource, err := NewResource(pattern, name) + if err != nil { + return err + } + r.GPUs = append(r.GPUs, *resource) + return nil +} + +// AddMIGResource adds a MIG resource to the list of MIG resources. +func (r *Resources) AddMIGResource(pattern, name string) error { + resource, err := NewResource(pattern, name) + if err != nil { + return err + } + r.MIGs = append(r.MIGs, *resource) + return nil +} + +// Matches checks if the provided string matches the ResourcePattern or not. +func (p ResourcePattern) Matches(s string) bool { + result, _ := regexp.MatchString(wildCardToRegexp(string(p)), s) + return result +} + +// wildCardToRegexp converts a wildcard pattern to a regular expression pattern. 
+func wildCardToRegexp(pattern string) string { + var result strings.Builder + for i, literal := range strings.Split(pattern, "*") { + // Replace * with .* + if i > 0 { + result.WriteString(".*") + } + // Quote any regular expression meta characters in the literal text. + result.WriteString(regexp.QuoteMeta(literal)) + } + return result.String() +} diff --git a/pkg/nvidia-plugin/api/config/v1/sharing.go b/pkg/nvidia-plugin/api/config/v1/sharing.go new file mode 100644 index 000000000..e7b3b9af0 --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/sharing.go @@ -0,0 +1,69 @@ +/* +* SPDX-License-Identifier: Apache-2.0 +* +* The HAMi Contributors require contributions made to +* this file be licensed under the Apache-2.0 license or a +* compatible open source license. + */ + +/* +* Licensed to NVIDIA CORPORATION under one or more contributor +* license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright +* ownership. NVIDIA CORPORATION licenses this file to you under +* the Apache License, Version 2.0 (the "License"); you may +* not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +*     http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied.  See the License for the +* specific language governing permissions and limitations +* under the License. + */ + +/* +* Modifications Copyright The HAMi Authors. See +* GitHub history for details. + */ + +package v1 + +// Sharing encapsulates the set of sharing strategies that are supported. +type Sharing struct { + // TimeSlicing defines the set of replicas to be made for timeSlicing available resources. 
+ TimeSlicing ReplicatedResources `json:"timeSlicing,omitempty" yaml:"timeSlicing,omitempty"` + // MPS defines the set of replicas to be shared using MPS + MPS *ReplicatedResources `json:"mps,omitempty" yaml:"mps,omitempty"` +} + +type SharingStrategy string + +const ( + SharingStrategyMPS = SharingStrategy("mps") + SharingStrategyNone = SharingStrategy("none") + SharingStrategyTimeSlicing = SharingStrategy("time-slicing") +) + +// SharingStrategy returns the active sharing strategy. +func (s *Sharing) SharingStrategy() SharingStrategy { + if s.MPS != nil && s.MPS.isReplicated() { + return SharingStrategyMPS + } + + if s.TimeSlicing.isReplicated() { + return SharingStrategyTimeSlicing + } + return SharingStrategyNone +} + +// ReplicatedResources returns the resources associated with the active sharing strategy. +func (s *Sharing) ReplicatedResources() *ReplicatedResources { + if s.MPS != nil { + return s.MPS + } + return &s.TimeSlicing +} diff --git a/pkg/nvidia-plugin/api/config/v1/strategy.go b/pkg/nvidia-plugin/api/config/v1/strategy.go new file mode 100644 index 000000000..4d39581c9 --- /dev/null +++ b/pkg/nvidia-plugin/api/config/v1/strategy.go @@ -0,0 +1,69 @@ +/* + * Copyright (c), NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package v1 + +import ( + "fmt" + "strings" +) + +// DeviceListStrategies defines which strategies are enabled and should +// be used when passing the device list to the container runtime. +type DeviceListStrategies map[string]bool + +// NewDeviceListStrategies constructs a new DeviceListStrategy +func NewDeviceListStrategies(strategies []string) (DeviceListStrategies, error) { + ret := map[string]bool{ + DeviceListStrategyEnvVar: false, + DeviceListStrategyVolumeMounts: false, + DeviceListStrategyCDIAnnotations: false, + DeviceListStrategyCDICRI: false, + } + for _, s := range strategies { + if _, ok := ret[s]; !ok { + return nil, fmt.Errorf("invalid strategy: %v", s) + } + ret[s] = true + } + + return DeviceListStrategies(ret), nil +} + +// Includes returns whether the given strategy is present in the set of strategies. +func (s DeviceListStrategies) Includes(strategy string) bool { + return s[strategy] +} + +// AnyCDIEnabled returns whether any of the strategies being used require CDI. +func (s DeviceListStrategies) AnyCDIEnabled() bool { + for k, v := range s { + if strings.HasPrefix(k, "cdi-") && v { + return true + } + } + return false +} + +// AllCDIEnabled returns whether all strategies being used require CDI. +func (s DeviceListStrategies) AllCDIEnabled() bool { + for k, v := range s { + if !strings.HasPrefix(k, "cdi-") && v { + return false + } + } + return true +} diff --git a/pkg/nvidia-plugin/mps-control-daemon/main.go b/pkg/nvidia-plugin/mps-control-daemon/main.go new file mode 100644 index 000000000..29259c29c --- /dev/null +++ b/pkg/nvidia-plugin/mps-control-daemon/main.go @@ -0,0 +1,255 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package main + +import ( + "encoding/json" + "errors" + "fmt" + "os" + "syscall" + "time" + + "github.com/urfave/cli/v2" + "k8s.io/klog/v2" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + nvinfo "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" + + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/mps-control-daemon/mount" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/mps-control-daemon/mps" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/info" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/logger" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/watch" + + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" +) + +// Config represents a collection of config options for the device plugin. +type Config struct { + configFile string + + // flags stores the CLI flags for later processing. 
+ flags []cli.Flag +} + +func main() { + config := &Config{} + + c := cli.NewApp() + c.Name = "NVIDIA MPS Control Daemon" + c.Version = info.GetVersionString() + c.Action = func(ctx *cli.Context) error { + return start(ctx, config) + } + c.Commands = []*cli.Command{ + mount.NewCommand(), + } + + config.flags = []cli.Flag{ + &cli.StringFlag{ + Name: "config-file", + Usage: "the path to a config file as an alternative to command line options or environment variables", + Destination: &config.configFile, + EnvVars: []string{"CONFIG_FILE"}, + }, + &cli.StringFlag{ + Name: "mig-strategy", + Value: spec.MigStrategyNone, + Usage: "the desired strategy for exposing MIG devices on GPUs that support it:\n\t\t[none | single | mixed]", + EnvVars: []string{"MIG_STRATEGY"}, + }, + } + c.Flags = config.flags + + klog.InfoS(c.Name, "version", c.Version) + err := c.Run(os.Args) + if err != nil { + klog.Error(err) + os.Exit(1) + } +} + +// TODO: This needs to do similar validation to the plugin. +func validateFlags(config *spec.Config) error { + return nil +} + +// loadConfig loads the config from the spec file. +func (cfg *Config) loadConfig(c *cli.Context) (*spec.Config, error) { + config, err := spec.NewConfig(c, cfg.flags) + if err != nil { + return nil, fmt.Errorf("unable to finalize config: %w", err) + } + err = validateFlags(config) + if err != nil { + return nil, fmt.Errorf("unable to validate flags: %w", err) + } + config.Flags.GFD = nil + + return config, nil +} + +// loadNvidiaConfig loads the config from the spec file and wraps it in an nvidia.DeviceConfig. +func (cfg *Config) loadNvidiaConfig(c *cli.Context) (*nvidia.DeviceConfig, error) { + devcfg := &nvidia.DeviceConfig{} + + config, err := spec.NewConfig(c, cfg.flags) + if err != nil { + return nil, fmt.Errorf("unable to finalize config: %w", err) + } + err = validateFlags(config) + if err != nil { + return nil, fmt.Errorf("unable to validate flags: %w", err) + } + config.Flags.GFD = nil + // Set the config in the device config.
+ devcfg.Config = config + return devcfg, nil +} + +func start(c *cli.Context, cfg *Config) error { + klog.Info("Starting OS watcher.") + sigs := watch.Signals(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) + var started bool + var restartTimeout <-chan time.Time + var daemons []*mps.Daemon +restart: + // If we are restarting, stop daemons from previous run. + if started { + err := stopDaemons(daemons...) + if err != nil { + return fmt.Errorf("error stopping plugins from previous run: %v", err) + } + } + + klog.Info("Starting Daemons.") + daemons, restartDaemons, err := startDaemons(c, cfg) + if err != nil { + return fmt.Errorf("error starting plugins: %v", err) + } + started = true + + if restartDaemons { + klog.Infof("Failed to start one or more MPS deamons. Retrying in 30s...") + restartTimeout = time.After(30 * time.Second) + } + + // Start an infinite loop, waiting for several indicators to either log + // some messages, trigger a restart of the plugins, or exit the program. + for { + select { + // If the restart timeout has expired, then restart the plugins + case <-restartTimeout: + goto restart + + // Watch for any signals from the OS. On SIGHUP, restart this loop, + // restarting all of the plugins in the process. On all other + // signals, exit the loop and exit the program. 
+ case s := <-sigs: + switch s { + case syscall.SIGHUP: + klog.Info("Received SIGHUP, restarting.") + goto restart + default: + klog.Infof("Received signal \"%v\", shutting down.", s) + goto exit + } + } + } +exit: + if err := stopDaemons(daemons...); err != nil { + return fmt.Errorf("error stopping daemons: %v", err) + } + return nil +} + +func startDaemons(c *cli.Context, cfg *Config) ([]*mps.Daemon, bool, error) { + // Load the configuration file + klog.Info("Loading configuration.") + config, err := cfg.loadNvidiaConfig(c) + if err != nil { + return nil, false, fmt.Errorf("unable to load config: %v", err) + } + spec.DisableResourceNamingInConfig(logger.ToKlog, config.Config) + + nvmllib := nvml.New() + devicelib := device.New(nvmllib) + infolib := nvinfo.New( + nvinfo.WithNvmlLib(nvmllib), + nvinfo.WithDeviceLib(devicelib), + ) + + // Update the configuration file with default resources. + klog.Info("Updating config with default resource matching patterns.") + err = rm.AddDefaultResourcesToConfig(infolib, nvmllib, devicelib, config.Config) + if err != nil { + return nil, false, fmt.Errorf("unable to add default resources to config: %v", err) + } + + // Print the config to the output. + configJSON, err := json.MarshalIndent(config, "", " ") + if err != nil { + return nil, false, fmt.Errorf("failed to marshal config to JSON: %v", err) + } + klog.Infof("\nRunning with config:\n%v", string(configJSON)) + + // Get the set of daemons. + // Note that a daemon is only created for resources with at least one device. + klog.Info("Retrieving MPS daemons.") + mpsDaemons, err := mps.NewDaemons(infolib, nvmllib, devicelib, + mps.WithConfig(config), + ) + if err != nil { + return nil, false, fmt.Errorf("error getting daemons: %v", err) + } + + if len(mpsDaemons) == 0 { + klog.Info("No devices are configured for MPS sharing; Waiting indefinitely.") + } + + // Loop through all MPS daemons and start them. + // If any daemon fails to start, all daemons are started again. 
+ for _, mpsDaemon := range mpsDaemons { + if err := mpsDaemon.Start(); err != nil { + klog.Errorf("Failed to start MPS daemon: %v", err) + return mpsDaemons, true, nil + } + } + readyFile, err := os.Create("/mps/.ready") + if err != nil { + return mpsDaemons, true, fmt.Errorf("failed to create .ready file") + } + defer readyFile.Close() + + return mpsDaemons, false, nil +} + +func stopDaemons(mpsDaemons ...*mps.Daemon) error { + if err := os.Remove("/mps/.ready"); err != nil { + klog.Warningf("Failed to remove .ready file: %v", err) + } + klog.Info("Stopping MPS daemons.") + var errs error + for _, p := range mpsDaemons { + errs = errors.Join(errs, p.Stop()) + } + return errs +} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mount/mount-shm.go b/pkg/nvidia-plugin/mps-control-daemon/mount/mount-shm.go new file mode 100644 index 000000000..83825e812 --- /dev/null +++ b/pkg/nvidia-plugin/mps-control-daemon/mount/mount-shm.go @@ -0,0 +1,108 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package mount + +import ( + "bufio" + "fmt" + "os" + "os/exec" + "strconv" + "strings" + + "github.com/urfave/cli/v2" + "k8s.io/klog/v2" + "k8s.io/mount-utils" +) + +// NewCommand constructs a mount command. 
+func NewCommand() *cli.Command { + c := cli.Command{ + Name: "mount-shm", + Usage: "Set up the /dev/shm mount required by the MPS daemon", + Action: mountShm, + } + + return &c +} + +// mountShm creates a tmpfs mount at /mps/shm to be used by the mps control daemon. +func mountShm(c *cli.Context) error { + mountExecutable, err := exec.LookPath("mount") + if err != nil { + return fmt.Errorf("error finding 'mount' executable: %w", err) + } + mounter := mount.New(mountExecutable) + + // TODO: /mps should be configurable. + shmDir := "/mps/shm" + err = mount.CleanupMountPoint(shmDir, mounter, true) + if err != nil { + return fmt.Errorf("error unmounting %v: %w", shmDir, err) + } + + if err := os.MkdirAll(shmDir, 0755); err != nil { + return fmt.Errorf("error creating directory %v: %w", shmDir, err) + } + + sizeArg := fmt.Sprintf("size=%v", getDefaultShmSize()) + mountOptions := []string{"rw", "nosuid", "nodev", "noexec", "relatime", sizeArg} + if err := mounter.Mount("shm", shmDir, "tmpfs", mountOptions); err != nil { + return fmt.Errorf("error mounting %v as tmpfs: %w", shmDir, err) + } + + return nil +} + +// getDefaultShmSize returns the default size for the tmpfs to be created. +// This reads /proc/meminfo to get the total memory to calculate this. If this +// fails a fallback size of 65536k is used. 
+func getDefaultShmSize() string { + const fallbackSize = "65536k" + + meminfo, err := os.Open("/proc/meminfo") + if err != nil { + klog.ErrorS(err, "failed to open /proc/meminfo") + return fallbackSize + } + defer func() { + _ = meminfo.Close() + }() + + scanner := bufio.NewScanner(meminfo) + for scanner.Scan() { + line := scanner.Text() + if !strings.HasPrefix(line, "MemTotal:") { + continue + } + + parts := strings.SplitN(strings.TrimSpace(strings.TrimPrefix(line, "MemTotal:")), " ", 2) + memTotal, err := strconv.Atoi(parts[0]) + if err != nil { + klog.ErrorS(err, "could not convert MemTotal to an integer") + return fallbackSize + } + + var unit string + if len(parts) == 2 { + unit = string(parts[1][0]) + } + + return fmt.Sprintf("%d%s", memTotal/2, unit) + } + return fallbackSize +} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/daemon.go b/pkg/nvidia-plugin/mps-control-daemon/mps/daemon.go new file mode 100644 index 000000000..5d23c61ae --- /dev/null +++ b/pkg/nvidia-plugin/mps-control-daemon/mps/daemon.go @@ -0,0 +1,280 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package mps + +import ( + "bytes" + "errors" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + + "github.com/opencontainers/selinux/go-selinux" + "k8s.io/klog/v2" + + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" +) + +type computeMode string + +const ( + mpsControlBin = "nvidia-cuda-mps-control" + + computeModeExclusiveProcess = computeMode("EXCLUSIVE_PROCESS") + computeModeDefault = computeMode("DEFAULT") + + unprivilegedContainerSELinuxLabel = "system_u:object_r:container_file_t:s0" +) + +// Daemon represents an MPS daemon. +// It is associated with a specific Kubernetes resource and is responsible for +// starting and stopping the daemon as well as ensuring that the memory and +// thread limits are set for the devices that the resource makes available. +type Daemon struct { + rm rm.ResourceManager + // root represents the root at which the files and folders controlled by the + // daemon are created. These include the log and pipe directories. + root Root + // logTailer tails the MPS control daemon logs. + logTailer *tailer +} + +// NewDaemon creates an MPS daemon instance. +func NewDaemon(rm rm.ResourceManager, root Root) *Daemon { + return &Daemon{ + rm: rm, + root: root, + } +} + +// Devices returns the list of devices under the control of this MPS daemon. +func (d *Daemon) Devices() rm.Devices { + return d.rm.Devices() +} + +type envvars map[string]string + +func (e envvars) toSlice() []string { + var envs []string + for k, v := range e { + envs = append(envs, k+"="+v) + } + return envs +} + +// EnvVars returns the environment variables required for the daemon. +// These should be passed to clients consuming the device shared using MPS. +// TODO: Set CUDA_VISIBLE_DEVICES to include only the devices for this resource type. +func (d *Daemon) EnvVars() envvars { + return map[string]string{ + "CUDA_MPS_PIPE_DIRECTORY": d.PipeDir(), + "CUDA_MPS_LOG_DIRECTORY": d.LogDir(), + } +} + +// Start starts the MPS daemon as a background process.
+func (d *Daemon) Start() error { + if err := d.setComputeMode(computeModeExclusiveProcess); err != nil { + return fmt.Errorf("error setting compute mode %v: %w", computeModeExclusiveProcess, err) + } + + klog.InfoS("Staring MPS daemon", "resource", d.rm.Resource()) + + pipeDir := d.PipeDir() + if err := os.MkdirAll(pipeDir, 0755); err != nil { + return fmt.Errorf("error creating directory %v: %w", pipeDir, err) + } + + if err := setSELinuxContext(pipeDir, unprivilegedContainerSELinuxLabel); err != nil { + return fmt.Errorf("error setting SELinux context: %w", err) + } + + logDir := d.LogDir() + if err := os.MkdirAll(logDir, 0755); err != nil { + return fmt.Errorf("error creating directory %v: %w", logDir, err) + } + + mpsDaemon := exec.Command(mpsControlBin, "-d") + mpsDaemon.Env = append(mpsDaemon.Env, d.EnvVars().toSlice()...) + if err := mpsDaemon.Run(); err != nil { + return err + } + + for index, limit := range d.perDevicePinnedDeviceMemoryLimits() { + _, err := d.EchoPipeToControl(fmt.Sprintf("set_default_device_pinned_mem_limit %s %s", index, limit)) + if err != nil { + return fmt.Errorf("error setting pinned memory limit for device %v: %w", index, err) + } + } + if threadPercentage := d.activeThreadPercentage(); threadPercentage != "" { + _, err := d.EchoPipeToControl(fmt.Sprintf("set_default_active_thread_percentage %s", threadPercentage)) + if err != nil { + return fmt.Errorf("error setting active thread percentage: %w", err) + } + } + + statusFile, err := os.Create(d.startedFile()) + if err != nil { + return err + } + defer statusFile.Close() + + d.logTailer = newTailer(filepath.Join(logDir, "control.log")) + klog.InfoS("Starting log tailer", "resource", d.rm.Resource()) + if err := d.logTailer.Start(); err != nil { + klog.ErrorS(err, "Could not start tail command on control.log; ignoring logs") + } + + return nil +} + +func setSELinuxContext(path string, context string) error { + _, err := os.Stat("/sys/fs/selinux") + if err != nil && errors.Is(err, 
os.ErrNotExist) { + klog.InfoS("SELinux disabled, not updating context", "path", path) + return nil + } else if err != nil { + return fmt.Errorf("error checking if SELinux is enabled: %w", err) + } + + klog.InfoS("SELinux enabled, setting context", "path", path, "context", context) + return selinux.Chcon(path, context, true) +} + +// Stop ensures that the MPS daemon is quit. +func (d *Daemon) Stop() error { + _, err := d.EchoPipeToControl("quit") + if err != nil { + return fmt.Errorf("error sending quit message: %w", err) + } + klog.InfoS("Stopped MPS control daemon", "resource", d.rm.Resource()) + + err = d.logTailer.Stop() + klog.InfoS("Stopped log tailer", "resource", d.rm.Resource(), "error", err) + + if err := d.setComputeMode(computeModeDefault); err != nil { + return fmt.Errorf("error setting compute mode %v: %w", computeModeDefault, err) + } + + if err := os.Remove(d.startedFile()); err != nil && err != os.ErrNotExist { + return fmt.Errorf("failed to remove started file: %w", err) + } + + logDir := d.LogDir() + if err := os.RemoveAll(logDir); err != nil { + klog.ErrorS(err, "Failed to remove pipe directory", "path", logDir) + } + + return nil +} + +func (d *Daemon) LogDir() string { + return d.root.LogDir(d.rm.Resource()) +} + +func (d *Daemon) PipeDir() string { + return d.root.PipeDir(d.rm.Resource()) +} + +func (d *Daemon) ShmDir() string { + return "/dev/shm" +} + +func (d *Daemon) startedFile() string { + return d.root.startedFile(d.rm.Resource()) +} + +// AssertHealthy checks that the MPS control daemon is healthy. +func (d *Daemon) AssertHealthy() error { + _, err := d.EchoPipeToControl("get_default_active_thread_percentage") + return err +} + +// EchoPipeToControl sends the specified command to the MPS control daemon. 
+func (d *Daemon) EchoPipeToControl(command string) (string, error) { + var out bytes.Buffer + reader, writer := io.Pipe() + defer writer.Close() + defer reader.Close() + + mpsDaemon := exec.Command(mpsControlBin) + mpsDaemon.Env = append(mpsDaemon.Env, d.EnvVars().toSlice()...) + + mpsDaemon.Stdin = reader + mpsDaemon.Stdout = &out + + if err := mpsDaemon.Start(); err != nil { + return "", fmt.Errorf("failed to start NVIDIA MPS command: %w", err) + } + + if _, err := writer.Write([]byte(command)); err != nil { + return "", fmt.Errorf("failed to write message to pipe: %w", err) + } + _ = writer.Close() + + if err := mpsDaemon.Wait(); err != nil { + return "", fmt.Errorf("failed to send command to MPS daemon: %w", err) + } + return out.String(), nil +} + +func (d *Daemon) setComputeMode(mode computeMode) error { + for _, uuid := range d.Devices().GetUUIDs() { + cmd := exec.Command( + "nvidia-smi", + "-i", uuid, + "-c", string(mode)) + output, err := cmd.CombinedOutput() + if err != nil { + klog.Errorf("\n%v", string(output)) + return fmt.Errorf("error running nvidia-smi: %w", err) + } + } + return nil +} + +// perDevicePinnedMemoryLimits returns the pinned memory limits for each device. 
+func (m *Daemon) perDevicePinnedDeviceMemoryLimits() map[string]string { + totalMemoryInBytesPerDevice := make(map[string]uint64) + replicasPerDevice := make(map[string]uint64) + for _, device := range m.Devices() { + index := device.Index + totalMemoryInBytesPerDevice[index] = device.TotalMemory + replicasPerDevice[index] += 1 + } + + limits := make(map[string]string) + for index, totalMemory := range totalMemoryInBytesPerDevice { + if totalMemory == 0 { + continue + } + replicas := replicasPerDevice[index] + limits[index] = fmt.Sprintf("%vM", totalMemory/replicas/1024/1024) + } + return limits +} + +func (m *Daemon) activeThreadPercentage() string { + if len(m.Devices()) == 0 { + return "" + } + replicasPerDevice := len(m.Devices()) / len(m.Devices().GetUUIDs()) + + return fmt.Sprintf("%d", 100/replicasPerDevice) +} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/device.go b/pkg/nvidia-plugin/mps-control-daemon/mps/device.go new file mode 100644 index 000000000..bd8b1bf3c --- /dev/null +++ b/pkg/nvidia-plugin/mps-control-daemon/mps/device.go @@ -0,0 +1,55 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package mps + +import ( + "errors" + "fmt" + "strings" + + "golang.org/x/mod/semver" + + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" +) + +var errInvalidDevice = errors.New("invalid device") + +// mpsDevice represents an MPS-specific alias for an rm.Device. 
+type mpsDevice rm.Device + +// assertReplicas checks whether the number of replicas specified is valid. +func (d *mpsDevice) assertReplicas() error { + maxClients := d.maxClients() + if d.Replicas > maxClients { + return fmt.Errorf("%w maximum allowed replicas exceeded: %d > %d", errInvalidDevice, d.Replicas, maxClients) + } + return nil +} + +// maxClients returns the maximum number of clients supported by an MPS server. +func (d *mpsDevice) maxClients() int { + if d.isAtLeastVolta() { + return 48 + } + return 16 +} + +// isAtLeastVolta checks whether the specified device is a volta device or newer. +func (d *mpsDevice) isAtLeastVolta() bool { + vCc := "v" + strings.TrimPrefix(d.ComputeCapability, "v") + return semver.Compare(semver.Canonical(vCc), semver.Canonical("v7.5")) >= 0 +} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/device_test.go b/pkg/nvidia-plugin/mps-control-daemon/mps/device_test.go new file mode 100644 index 000000000..17cef28ea --- /dev/null +++ b/pkg/nvidia-plugin/mps-control-daemon/mps/device_test.go @@ -0,0 +1,112 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package mps + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestDevice(t *testing.T) { + testCases := []struct { + description string + input mpsDevice + expectedIsAtLeastVolta bool + expectedMaxClients int + expectedAssertReplicas error + }{ + { + description: "leading v ignored", + input: mpsDevice{ + ComputeCapability: "v7.5", + }, + expectedIsAtLeastVolta: true, + expectedMaxClients: 48, + }, + { + description: "no-leading v supported", + input: mpsDevice{ + ComputeCapability: "7.5", + }, + expectedIsAtLeastVolta: true, + expectedMaxClients: 48, + }, + { + description: "pre-volta clients", + input: mpsDevice{ + ComputeCapability: "7.0", + }, + expectedIsAtLeastVolta: false, + expectedMaxClients: 16, + }, + { + description: "post-volta clients", + input: mpsDevice{ + ComputeCapability: "9.0", + }, + expectedIsAtLeastVolta: true, + expectedMaxClients: 48, + }, + { + description: "pre-volta clients exceeded", + input: mpsDevice{ + ComputeCapability: "7.0", + Replicas: 29, + }, + expectedIsAtLeastVolta: false, + expectedMaxClients: 16, + expectedAssertReplicas: errInvalidDevice, + }, + { + description: "post-volta clients exceeded", + input: mpsDevice{ + ComputeCapability: "9.0", + Replicas: 49, + }, + expectedIsAtLeastVolta: true, + expectedMaxClients: 48, + expectedAssertReplicas: errInvalidDevice, + }, + { + description: "pre-volta clients max", + input: mpsDevice{ + ComputeCapability: "7.0", + Replicas: 16, + }, + expectedIsAtLeastVolta: false, + expectedMaxClients: 16, + }, + { + description: "post-volta clients max", + input: mpsDevice{ + ComputeCapability: "9.0", + Replicas: 48, + }, + expectedIsAtLeastVolta: true, + expectedMaxClients: 48, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + require.Equal(t, tc.expectedIsAtLeastVolta, tc.input.isAtLeastVolta()) + require.Equal(t, tc.expectedMaxClients, tc.input.maxClients()) + require.ErrorIs(t, tc.input.assertReplicas(), 
tc.expectedAssertReplicas) + }) + } +} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/log-tailer.go b/pkg/nvidia-plugin/mps-control-daemon/mps/log-tailer.go new file mode 100644 index 000000000..d9fb87b84 --- /dev/null +++ b/pkg/nvidia-plugin/mps-control-daemon/mps/log-tailer.go @@ -0,0 +1,69 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package mps + +import ( + "context" + "os" + "os/exec" +) + +// tailer tails the contents of a file. +type tailer struct { + filename string + cmd *exec.Cmd + cancel context.CancelFunc +} + +// newTailer creates a tailer. +func newTailer(filename string) *tailer { + return &tailer{ + filename: filename, + } +} + +// Start starts tailing the specified filename. +func (t *tailer) Start() error { + ctx, cancel := context.WithCancel(context.Background()) + t.cancel = cancel + + //nolint:gosec // G204: Subprocess launched with a potential tainted input or cmd arguments (gosec) + cmd := exec.CommandContext(ctx, "tail", "-n", "+1", "-f", t.filename) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if err := cmd.Start(); err != nil { + return err + } + t.cmd = cmd + return nil +} + +// Stop stops the tailer. +// The associated cancel function is called after which the command wait is +// called -- if applicable. 
+func (t *tailer) Stop() error { + if t.cancel != nil { + t.cancel() + } + + if t.cmd == nil { + return nil + } + + return t.cmd.Wait() +} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/manager.go b/pkg/nvidia-plugin/mps-control-daemon/mps/manager.go new file mode 100644 index 000000000..719a358e6 --- /dev/null +++ b/pkg/nvidia-plugin/mps-control-daemon/mps/manager.go @@ -0,0 +1,112 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package mps + +import ( + "fmt" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "k8s.io/klog/v2" + + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" +) + +type Manager interface { + Daemons() ([]*Daemon, error) +} + +type manager struct { + infolib info.Interface + nvmllib nvml.Interface + devicelib device.Interface + config *nvidia.DeviceConfig +} + +type nullManager struct{} + +// NewDaemons creates the required set of MPS daemons for the specified options. +func NewDaemons(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, opts ...Option) ([]*Daemon, error) { + manager, err := New(infolib, nvmllib, devicelib, opts...)
+ if err != nil { + return nil, fmt.Errorf("failed to create MPS manager: %w", err) + } + return manager.Daemons() +} + +// New creates a manager for MPS daemons. +// If MPS is not configured, a manager is returned that manages no daemons. NOTE(review): m.config is dereferenced below without a nil check, so this presumably requires WithConfig to be passed -- confirm. +func New(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, opts ...Option) (Manager, error) { + m := &manager{ + infolib: infolib, + nvmllib: nvmllib, + devicelib: devicelib, + } + for _, opt := range opts { + opt(m) + } + + if strategy := m.config.Sharing.SharingStrategy(); strategy != spec.SharingStrategyMPS { + klog.InfoS("Sharing strategy is not MPS; skipping MPS manager creation", "strategy", strategy) + return &nullManager{}, nil + } + + return m, nil +} + +func (m *manager) Daemons() ([]*Daemon, error) { + resourceManagers, err := rm.NewNVMLResourceManagers(m.infolib, m.nvmllib, m.devicelib, m.config) + if err != nil { + return nil, err + } + var daemons []*Daemon + for _, resourceManager := range resourceManagers { + // We don't create daemons if there are no devices associated with the resource manager. + if len(resourceManager.Devices()) == 0 { + klog.InfoS("No devices associated with resource", "resource", resourceManager.Resource()) + continue + } + // Check if the resources are shared. + // TODO: We should add a more explicit check for MPS specifically + if !rm.AnnotatedIDs(resourceManager.Devices().GetIDs()).AnyHasAnnotations() { + klog.InfoS("Resource is not shared", "resource", resourceManager.Resource()) + continue + } + // Check if MIG devices are included. NOTE(review): the continue in the loop below only skips the MIG device; the daemon is still created after the loop, although the warning says creation is skipped -- confirm intent.
+ for _, rmDevice := range resourceManager.Devices() { + if rmDevice.IsMigDevice() { + klog.Warning("MPS sharing is not supported for MIG devices; skipping daemon creation") + continue + } + if err := (*mpsDevice)(rmDevice).assertReplicas(); err != nil { + return nil, fmt.Errorf("invalid MPS configuration: %w", err) + } + } + daemon := NewDaemon(resourceManager, ContainerRoot) + daemons = append(daemons, daemon) + } + + return daemons, nil +} + +// Daemons always returns an empty slice for a nullManager. +func (m *nullManager) Daemons() ([]*Daemon, error) { + return nil, nil +} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/options.go b/pkg/nvidia-plugin/mps-control-daemon/mps/options.go new file mode 100644 index 000000000..ca97d122f --- /dev/null +++ b/pkg/nvidia-plugin/mps-control-daemon/mps/options.go @@ -0,0 +1,29 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package mps + +import "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + +// Option defines a functional option for configuring an MPS manager. +type Option func(*manager) + +// WithConfig sets the config associated with the MPS manager. 
+func WithConfig(config *nvidia.DeviceConfig) Option { + return func(m *manager) { + m.config = config + } +} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/root.go b/pkg/nvidia-plugin/mps-control-daemon/mps/root.go new file mode 100644 index 000000000..9c2e105f8 --- /dev/null +++ b/pkg/nvidia-plugin/mps-control-daemon/mps/root.go @@ -0,0 +1,59 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package mps + +import ( + "path/filepath" + + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" +) + +const ( + ContainerRoot = Root("/mps") +) + +// Root represents an MPS root. +// This is where per-resource pipe and log directories are created. +// For containerised applications the host root is typically mounted to /mps in the container. +type Root string + +// LogDir returns the per-resource log dir for the specified root. +func (r Root) LogDir(resourceName spec.ResourceName) string { + return r.Path(string(resourceName), "log") +} + +// PipeDir returns the per-resource pipe dir for the specified root. +func (r Root) PipeDir(resourceName spec.ResourceName) string { + return r.Path(string(resourceName), "pipe") +} + +// ShmDir returns the shm dir associated with the root. +// Note that the shm dir is the same for all resources. +func (r Root) ShmDir(resourceName spec.ResourceName) string { + return r.Path("shm") +} + +// startedFile returns the per-resource .started file name for the specified root.
+func (r Root) startedFile(resourceName spec.ResourceName) string { + return r.Path(string(resourceName), ".started") +} + +// Path returns a path relative to the MPS root. +func (r Root) Path(parts ...string) string { + pathparts := append([]string{string(r)}, parts...) + return filepath.Join(pathparts...) +} diff --git a/pkg/nvidia-plugin/pkg/cdi/api.go b/pkg/nvidia-plugin/pkg/cdi/api.go new file mode 100644 index 000000000..29ea99ce7 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/cdi/api.go @@ -0,0 +1,31 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package cdi + +import "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" + +// Interface provides the API to the 'cdi' package +// +//go:generate moq -stub -out api_mock.go . Interface +type Interface interface { + CreateSpecFile() error + QualifiedName(string, string) string +} + +type cdiSpecGenerator interface { + GetSpec() (spec.Interface, error) +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/api_mock.go b/pkg/nvidia-plugin/pkg/cdi/api_mock.go similarity index 100% rename from pkg/device-plugin/nvidiadevice/nvinternal/cdi/api_mock.go rename to pkg/nvidia-plugin/pkg/cdi/api_mock.go diff --git a/pkg/nvidia-plugin/pkg/cdi/cdi.go b/pkg/nvidia-plugin/pkg/cdi/cdi.go new file mode 100644 index 000000000..b3227d437 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/cdi/cdi.go @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cdi + +import ( + "fmt" + "path/filepath" + "strings" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform" + transformroot "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform/root" + "github.com/sirupsen/logrus" + "k8s.io/klog/v2" + cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" + cdiparser "tags.cncf.io/container-device-interface/pkg/parser" + + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/imex" +) + +const ( + cdiRoot = "/var/run/cdi" +) + +// cdiHandler creates CDI specs for devices associated with the device plugin +type cdiHandler struct { + infolib info.Interface + nvmllib nvml.Interface + devicelib device.Interface + + logger *logrus.Logger + driverRoot string + devRoot string + targetDriverRoot string + targetDevRoot string + nvidiaCTKPath string + vendor string + deviceIDStrategy string + + deviceListStrategies spec.DeviceListStrategies + + gdsEnabled bool + mofedEnabled bool + + imexChannels imex.Channels + + cdilibs map[string]cdiSpecGenerator +} + +var _ Interface = &cdiHandler{} + +// New constructs a new instance of the 'cdi' interface +func New(infolib
info.Interface, nvmllib nvml.Interface, devicelib device.Interface, opts ...Option) (Interface, error) { + c := &cdiHandler{ + infolib: infolib, + nvmllib: nvmllib, + devicelib: devicelib, + } + for _, opt := range opts { + opt(c) + } + + if !c.deviceListStrategies.AnyCDIEnabled() { + return &null{}, nil + } + hasNVML, _ := infolib.HasNvml() + if !hasNVML { + klog.Warning("No valid resources detected, creating a null CDI handler") + return &null{}, nil + } + + if c.logger == nil { + c.logger = logrus.StandardLogger() + } + if c.deviceIDStrategy == "" { + c.deviceIDStrategy = "uuid" + } + if c.driverRoot == "" { + c.driverRoot = "/" + } + if c.devRoot == "" { + c.devRoot = c.driverRoot + } + if c.targetDriverRoot == "" { + c.targetDriverRoot = c.driverRoot + } + if c.targetDevRoot == "" { + c.targetDevRoot = c.devRoot + } + + deviceNamer, err := nvcdi.NewDeviceNamer(c.deviceIDStrategy) + if err != nil { + return nil, err + } + + c.cdilibs = make(map[string]cdiSpecGenerator) + + c.cdilibs["gpu"], err = nvcdi.New( + nvcdi.WithInfoLib(c.infolib), + nvcdi.WithNvmlLib(c.nvmllib), + nvcdi.WithDeviceLib(c.devicelib), + nvcdi.WithLogger(c.logger), + nvcdi.WithNVIDIACDIHookPath(c.nvidiaCTKPath), + nvcdi.WithDriverRoot(c.driverRoot), + nvcdi.WithDevRoot(c.devRoot), + nvcdi.WithDeviceNamers(deviceNamer), + nvcdi.WithVendor(c.vendor), + nvcdi.WithClass("gpu"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create nvcdi library: %v", err) + } + + if len(c.imexChannels) > 0 { + c.cdilibs["imex-channel"] = c.newImexChannelSpecGenerator() + } + + var additionalModes []string + if c.gdsEnabled { + additionalModes = append(additionalModes, "gds") + } + if c.mofedEnabled { + additionalModes = append(additionalModes, "mofed") + } + + for _, mode := range additionalModes { + lib, err := nvcdi.New( + nvcdi.WithInfoLib(c.infolib), + nvcdi.WithLogger(c.logger), + nvcdi.WithNVIDIACDIHookPath(c.nvidiaCTKPath), + nvcdi.WithDriverRoot(c.driverRoot), + nvcdi.WithDevRoot(c.devRoot), + 
nvcdi.WithVendor(c.vendor), + nvcdi.WithMode(mode), + ) + if err != nil { + return nil, fmt.Errorf("failed to create nvcdi library: %v", err) + } + c.cdilibs[mode] = lib + } + + return c, nil +} + +// CreateSpecFile creates a CDI spec file for the specified devices. +func (cdi *cdiHandler) CreateSpecFile() error { + for class, cdilib := range cdi.cdilibs { + cdi.logger.Infof("Generating CDI spec for resource: %s/%s", cdi.vendor, class) + + if class == "gpu" { + ret := cdi.nvmllib.Init() + if ret != nvml.SUCCESS { + return fmt.Errorf("failed to initialize NVML: %v", ret) + } + defer func() { + _ = cdi.nvmllib.Shutdown() + }() + } + + spec, err := cdilib.GetSpec() + if err != nil { + return fmt.Errorf("failed to get CDI spec: %v", err) + } + + // TODO: Once the NewDriverTransformer is merged in container-toolkit we can instantiate it directly. + transformer := cdi.getRootTransformer() + if err := transformer.Transform(spec.Raw()); err != nil { + return fmt.Errorf("failed to transform driver root in CDI spec: %v", err) + } + + specName, err := cdiapi.GenerateNameForSpec(spec.Raw()) + if err != nil { + return fmt.Errorf("failed to generate spec name: %v", err) + } + + err = spec.Save(filepath.Join(cdiRoot, specName+".json")) + if err != nil { + return fmt.Errorf("failed to save CDI spec: %v", err) + } + } + + return nil +} + +func (cdi *cdiHandler) getRootTransformer() transform.Transformer { + driverRootTransformer := transformroot.New( + transformroot.WithRoot(cdi.driverRoot), + transformroot.WithTargetRoot(cdi.targetDriverRoot), + transformroot.WithRelativeTo("host"), + ) + + if cdi.devRoot == cdi.driverRoot || cdi.devRoot == "" { + return driverRootTransformer + } + + ensureDev := func(p string) string { + return filepath.Join(strings.TrimSuffix(filepath.Clean(p), "/dev"), "/dev") + } + + devRootTransformer := transformroot.New( + transformroot.WithRoot(ensureDev(cdi.devRoot)), + transformroot.WithTargetRoot(ensureDev(cdi.targetDevRoot)), + 
transformroot.WithRelativeTo("host"), + ) + + return transform.Merge(driverRootTransformer, devRootTransformer) +} + +// QualifiedName constructs a CDI qualified device name for the specified resources. +// Note: This assumes that the specified id matches the device name returned by the naming strategy. +func (cdi *cdiHandler) QualifiedName(class string, id string) string { + return cdiparser.QualifiedName(cdi.vendor, class, id) +} diff --git a/pkg/nvidia-plugin/pkg/cdi/imex.go b/pkg/nvidia-plugin/pkg/cdi/imex.go new file mode 100644 index 000000000..38aaa0a41 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/cdi/imex.go @@ -0,0 +1,63 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package cdi + +import ( + "tags.cncf.io/container-device-interface/specs-go" + + "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" + + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/imex" +) + +type imexChannelCDILib struct { + vendor string + imexChannels imex.Channels +} + +func (cdi *cdiHandler) newImexChannelSpecGenerator() cdiSpecGenerator { + lib := &imexChannelCDILib{ + vendor: cdi.vendor, + imexChannels: cdi.imexChannels, + } + + return lib +} + +// GetSpec returns the CDI specs for IMEX channels. 
+func (l *imexChannelCDILib) GetSpec() (spec.Interface, error) { + var deviceSpecs []specs.Device + for _, channel := range l.imexChannels { + deviceSpec := specs.Device{ + Name: channel.ID, + ContainerEdits: specs.ContainerEdits{ + DeviceNodes: []*specs.DeviceNode{ + { + Path: channel.Path, + HostPath: channel.HostPath, + }, + }, + }, + } + deviceSpecs = append(deviceSpecs, deviceSpec) + } + return spec.New( + spec.WithDeviceSpecs(deviceSpecs), + spec.WithVendor(l.vendor), + spec.WithClass("imex-channel"), + ) +} diff --git a/pkg/nvidia-plugin/pkg/cdi/null.go b/pkg/nvidia-plugin/pkg/cdi/null.go new file mode 100644 index 000000000..16ccead0a --- /dev/null +++ b/pkg/nvidia-plugin/pkg/cdi/null.go @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cdi + +import ( + "k8s.io/klog/v2" +) + +type null struct{} + +var _ Interface = &null{} + +// NewNullHandler returns an instance of the 'cdi' interface that can +// be used when CDI specs are not required. +func NewNullHandler() Interface { + return &null{} +} + +// CreateSpecFile is a no-op for the null handler. +func (n *null) CreateSpecFile() error { + return nil +} + +// QualifiedName is a no-op for the null handler. An error message is logged +// indicating this should never be called for the null handler.
+func (n *null) QualifiedName(class string, id string) string { + klog.Error("cannot return a qualified CDI device name with the null CDI handler") + return "" +} diff --git a/pkg/nvidia-plugin/pkg/cdi/options.go b/pkg/nvidia-plugin/pkg/cdi/options.go new file mode 100644 index 000000000..392e744a6 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/cdi/options.go @@ -0,0 +1,102 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package cdi + +import ( + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/imex" +) + +// Option defines a function for passing options to the New() call +type Option func(*cdiHandler) + +// WithDeviceListStrategies provides an Option to set the enabled flag used by the 'cdi' interface +func WithDeviceListStrategies(deviceListStrategies spec.DeviceListStrategies) Option { + return func(c *cdiHandler) { + c.deviceListStrategies = deviceListStrategies + } +} + +// WithDriverRoot provides an Option to set the driver root used by the 'cdi' interface. +func WithDriverRoot(root string) Option { + return func(c *cdiHandler) { + c.driverRoot = root + } +} + +// WithDevRoot sets the dev root for the `cdi` interface. 
+func WithDevRoot(root string) Option { + return func(c *cdiHandler) { + c.devRoot = root + } +} + +// WithTargetDriverRoot provides an Option to set the target (host) driver root used by the 'cdi' interface +func WithTargetDriverRoot(root string) Option { + return func(c *cdiHandler) { + c.targetDriverRoot = root + } +} + +// WithTargetDevRoot provides an Option to set the target (host) dev root used by the 'cdi' interface +func WithTargetDevRoot(root string) Option { + return func(c *cdiHandler) { + c.targetDevRoot = root + } +} + +// WithNvidiaCTKPath provides an Option to set the nvidia-ctk path used by the 'cdi' interface +func WithNvidiaCTKPath(path string) Option { + return func(c *cdiHandler) { + c.nvidiaCTKPath = path + } +} + +// WithDeviceIDStrategy provides an Option to set the device ID strategy used by the 'cdi' interface +func WithDeviceIDStrategy(strategy string) Option { + return func(c *cdiHandler) { + c.deviceIDStrategy = strategy + } +} + +// WithVendor provides an Option to set the vendor used by the 'cdi' interface +func WithVendor(vendor string) Option { + return func(c *cdiHandler) { + c.vendor = vendor + } +} + +// WithGdsEnabled provides an option to set whether a GDS CDI spec should be generated +func WithGdsEnabled(enabled bool) Option { + return func(c *cdiHandler) { + c.gdsEnabled = enabled + } +} + +// WithMofedEnabled provides an option to set whether a MOFED CDI spec should be generated +func WithMofedEnabled(enabled bool) Option { + return func(c *cdiHandler) { + c.mofedEnabled = enabled + } +} + +// WithImexChannels sets the IMEX channels for which CDI specs should be generated.
+func WithImexChannels(imexChannels imex.Channels) Option { + return func(c *cdiHandler) { + c.imexChannels = imexChannels + } +} diff --git a/pkg/nvidia-plugin/pkg/cuda/api.go b/pkg/nvidia-plugin/pkg/cuda/api.go new file mode 100644 index 000000000..e43ce4a2b --- /dev/null +++ b/pkg/nvidia-plugin/pkg/cuda/api.go @@ -0,0 +1,119 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package cuda + +import ( + "github.com/NVIDIA/go-nvml/pkg/dl" +) + +const ( + libraryName = "libcuda.so.1" + libraryLoadFlags = dl.RTLD_LAZY | dl.RTLD_GLOBAL +) + +// cuda stores a reference the cuda dynamic library +var cuda *dl.DynamicLibrary + +// Init calls cuInit and initialized the library +func Init() Result { + lib := dl.New(libraryName, libraryLoadFlags) + if err := lib.Open(); err != nil { + return ERROR_UNKNOWN + } + cuda = lib + + if err := cuda.Lookup("cuInit"); err != nil { + return ERROR_UNKNOWN + } + + return cuInit(0) +} + +// Shutdown ensures that the CUDA library is unloaded. +func Shutdown() Result { + if cuda == nil { + return SUCCESS + } + if err := cuda.Close(); err != nil { + return ERROR_UNKNOWN + } + return SUCCESS +} + +// DriverGetVersion returns the driver version as an int. +func DriverGetVersion() (int, Result) { + var version int32 + r := cuDriverGetVersion(&version) + + return int(version), r +} + +// DeviceGet returns the device with the specified index. 
+func DeviceGet(index int) (Device, Result) { + var device Device + //nolint:gosec // Since index is internal-only, we ignore possible overflow errors here. + r := cuDeviceGet(&device, int32(index)) + + return device, r +} + +// DeviceGetAttribute returns the specified attribute for the specified device. +func DeviceGetAttribute(attribute DeviceAttribute, device Device) (int, Result) { + var value int32 + r := cuDeviceGetAttribute(&value, attribute, device) + return int(value), r +} + +// DeviceGetCount returns the number of CUDA-capable devices available +func DeviceGetCount() (int, Result) { + var count int32 + r := cuDeviceGetCount(&count) + return int(count), r +} + +// GetAttribute converts the DeviceGetAttribute function to a device method +func (device Device) GetAttribute(attribute DeviceAttribute) (int, Result) { + return DeviceGetAttribute(attribute, device) +} + +// DeviceGetName returns the name of the specified device. +func DeviceGetName(device Device) (string, Result) { + len := int32(96) + name := make([]byte, len) + + r := cuDeviceGetName(&name[0], len, device) + + return string(name[:clen(name)]), r +} + +// GetName converts the DeviceGetname function to a device method +func (device Device) GetName() (string, Result) { + return DeviceGetName(device) +} + +// DeviceTotalMem returns the total memory for the specified device +func DeviceTotalMem(device Device) (uint64, Result) { + var bytes uint64 + r := cuDeviceTotalMem(&bytes, device) + + return bytes, r +} + +// TotalMem converts the DeviceTotalMem function to a device method +func (device Device) TotalMem() (uint64, Result) { + return DeviceTotalMem(device) +} diff --git a/pkg/nvidia-plugin/pkg/cuda/cgo_helpers.go b/pkg/nvidia-plugin/pkg/cuda/cgo_helpers.go new file mode 100644 index 000000000..24fe6b8e7 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/cuda/cgo_helpers.go @@ -0,0 +1,27 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package cuda + +// clen return the length of a C string stored in a byte slice +func clen(n []byte) int { + for i := 0; i < len(n); i++ { + if n[i] == 0 { + return i + } + } + return len(n) +} diff --git a/pkg/nvidia-plugin/pkg/cuda/consts.go b/pkg/nvidia-plugin/pkg/cuda/consts.go new file mode 100644 index 000000000..f392ebf11 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/cuda/consts.go @@ -0,0 +1,95 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package cuda + +// Result represents the CUresult return type. 
// Result represents the CUresult return type of the CUDA driver API.
//
// The numeric values below must stay identical to the cudaError_enum values
// declared in the cgo preamble of cuda.go, because driver return codes are
// converted to Result by a plain integer cast.
type Result int32

const (
	SUCCESS                              Result = 0
	ERROR_INVALID_VALUE                  Result = 1
	ERROR_OUT_OF_MEMORY                  Result = 2
	ERROR_NOT_INITIALIZED                Result = 3
	ERROR_DEINITIALIZED                  Result = 4
	ERROR_PROFILER_DISABLED              Result = 5
	ERROR_PROFILER_NOT_INITIALIZED       Result = 6
	ERROR_PROFILER_ALREADY_STARTED       Result = 7
	ERROR_PROFILER_ALREADY_STOPPED       Result = 8
	ERROR_NO_DEVICE                      Result = 100
	ERROR_INVALID_DEVICE                 Result = 101
	ERROR_INVALID_IMAGE                  Result = 200
	ERROR_INVALID_CONTEXT                Result = 201
	ERROR_CONTEXT_ALREADY_CURRENT        Result = 202
	ERROR_MAP_FAILED                     Result = 205
	ERROR_UNMAP_FAILED                   Result = 206
	ERROR_ARRAY_IS_MAPPED                Result = 207
	ERROR_ALREADY_MAPPED                 Result = 208
	ERROR_NO_BINARY_FOR_GPU              Result = 209
	ERROR_ALREADY_ACQUIRED               Result = 210
	ERROR_NOT_MAPPED                     Result = 211
	ERROR_NOT_MAPPED_AS_ARRAY            Result = 212
	ERROR_NOT_MAPPED_AS_POINTER          Result = 213
	ERROR_ECC_UNCORRECTABLE              Result = 214
	ERROR_UNSUPPORTED_LIMIT              Result = 215
	ERROR_CONTEXT_ALREADY_IN_USE         Result = 216
	ERROR_PEER_ACCESS_UNSUPPORTED        Result = 217
	ERROR_INVALID_PTX                    Result = 218
	ERROR_INVALID_GRAPHICS_CONTEXT       Result = 219
	ERROR_NVLINK_UNCORRECTABLE           Result = 220
	ERROR_JIT_COMPILER_NOT_FOUND         Result = 221
	ERROR_INVALID_SOURCE                 Result = 300
	ERROR_FILE_NOT_FOUND                 Result = 301
	ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = 302
	ERROR_SHARED_OBJECT_INIT_FAILED      Result = 303
	ERROR_OPERATING_SYSTEM               Result = 304
	ERROR_INVALID_HANDLE                 Result = 400
	ERROR_NOT_FOUND                      Result = 500
	ERROR_NOT_READY                      Result = 600
	ERROR_ILLEGAL_ADDRESS                Result = 700
	ERROR_LAUNCH_OUT_OF_RESOURCES        Result = 701
	ERROR_LAUNCH_TIMEOUT                 Result = 702
	ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  Result = 703
	ERROR_PEER_ACCESS_ALREADY_ENABLED    Result = 704
	ERROR_PEER_ACCESS_NOT_ENABLED        Result = 705
	ERROR_PRIMARY_CONTEXT_ACTIVE         Result = 708
	ERROR_CONTEXT_IS_DESTROYED           Result = 709
	ERROR_ASSERT                         Result = 710
	ERROR_TOO_MANY_PEERS                 Result = 711
	ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = 712
	ERROR_HOST_MEMORY_NOT_REGISTERED     Result = 713
	ERROR_HARDWARE_STACK_ERROR           Result = 714
	ERROR_ILLEGAL_INSTRUCTION            Result = 715
	ERROR_MISALIGNED_ADDRESS             Result = 716
	ERROR_INVALID_ADDRESS_SPACE          Result = 717
	ERROR_INVALID_PC                     Result = 718
	ERROR_LAUNCH_FAILED                  Result = 719
	ERROR_COOPERATIVE_LAUNCH_TOO_LARGE   Result = 720
	ERROR_NOT_PERMITTED                  Result = 800
	ERROR_NOT_SUPPORTED                  Result = 801
	// CUDA_ERROR_UNKNOWN is 999 in the C enum; the previous value of 99 meant
	// a driver-reported unknown error never compared equal to ERROR_UNKNOWN.
	ERROR_UNKNOWN                        Result = 999
)

// DeviceAttribute represents the CUdevice_attribute type.
type DeviceAttribute int32

const (
	// COMPUTE_CAPABILITY_MAJOR mirrors CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR.
	COMPUTE_CAPABILITY_MAJOR DeviceAttribute = 75
	// COMPUTE_CAPABILITY_MINOR mirrors CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR.
	COMPUTE_CAPABILITY_MINOR DeviceAttribute = 76
)

// Device represents a CUDA device handle (CUdevice).
type Device int32
+**/ + +package cuda + +import ( + "unsafe" +) + +/* +#cgo linux LDFLAGS: -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files +#cgo darwin LDFLAGS: -Wl,-undefined,dynamic_lookup + +#ifdef _WIN32 +#define CUDAAPI __stdcall +#else +#define CUDAAPI +#endif + +typedef int CUdevice; + +typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76 +} CUdevice_attribute; + +typedef enum cudaError_enum { + CUDA_SUCCESS = 0, + CUDA_ERROR_INVALID_VALUE = 1, + CUDA_ERROR_OUT_OF_MEMORY = 2, + CUDA_ERROR_NOT_INITIALIZED = 3, + CUDA_ERROR_DEINITIALIZED = 4, + CUDA_ERROR_PROFILER_DISABLED = 5, + CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, + CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, + CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, + CUDA_ERROR_NO_DEVICE = 100, + CUDA_ERROR_INVALID_DEVICE = 101, + CUDA_ERROR_INVALID_IMAGE = 200, + CUDA_ERROR_INVALID_CONTEXT = 201, + CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, + CUDA_ERROR_MAP_FAILED = 205, + CUDA_ERROR_UNMAP_FAILED = 206, + CUDA_ERROR_ARRAY_IS_MAPPED = 207, + CUDA_ERROR_ALREADY_MAPPED = 208, + CUDA_ERROR_NO_BINARY_FOR_GPU = 209, + CUDA_ERROR_ALREADY_ACQUIRED = 210, + CUDA_ERROR_NOT_MAPPED = 211, + CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, + CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, + CUDA_ERROR_ECC_UNCORRECTABLE = 214, + CUDA_ERROR_UNSUPPORTED_LIMIT = 215, + CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, + CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, + CUDA_ERROR_INVALID_PTX = 218, + CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, + CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, + CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, + CUDA_ERROR_INVALID_SOURCE = 300, + CUDA_ERROR_FILE_NOT_FOUND = 301, + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, + CUDA_ERROR_OPERATING_SYSTEM = 304, + CUDA_ERROR_INVALID_HANDLE = 400, + CUDA_ERROR_NOT_FOUND = 500, + CUDA_ERROR_NOT_READY = 600, + CUDA_ERROR_ILLEGAL_ADDRESS = 700, + 
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, + CUDA_ERROR_LAUNCH_TIMEOUT = 702, + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, + CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, + CUDA_ERROR_ASSERT = 710, + CUDA_ERROR_TOO_MANY_PEERS = 711, + CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, + CUDA_ERROR_HARDWARE_STACK_ERROR = 714, + CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, + CUDA_ERROR_MISALIGNED_ADDRESS = 716, + CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, + CUDA_ERROR_INVALID_PC = 718, + CUDA_ERROR_LAUNCH_FAILED = 719, + CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, + CUDA_ERROR_NOT_PERMITTED = 800, + CUDA_ERROR_NOT_SUPPORTED = 801, + CUDA_ERROR_UNKNOWN = 999 +} CUresult; + +CUresult CUDAAPI cuInit(unsigned int Flags); +CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); +CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); +CUresult CUDAAPI cuDeviceGetCount(int *count); +CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); +CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); +*/ +import "C" + +// cuInit function as declared in cuda.h +func cuInit(flags uint32) Result { + cFlags := (C.uint)(flags) + _ret := C.cuInit(cFlags) + + return Result(_ret) +} + +// cuDeviceGet function as declared in cuda.h +func cuDeviceGet(device *Device, index int32) Result { + cDevice := (*C.CUdevice)(unsafe.Pointer(device)) + cIndex := (C.int)(index) + + _ret := C.cuDeviceGet(cDevice, cIndex) + + return Result(_ret) +} + +// cuDeviceGetAttribute function as declared in cuda.h +func cuDeviceGetAttribute(value *int32, attribute DeviceAttribute, dev Device) Result { + cValue := (*C.int)(unsafe.Pointer(value)) + cAttribute := (C.CUdevice_attribute)(attribute) + cDev := (C.CUdevice)(dev) + + _ret := 
C.cuDeviceGetAttribute(cValue, cAttribute, cDev) + + return Result(_ret) +} + +// cuDeviceGetCount function as declared in cuda.h +func cuDeviceGetCount(count *int32) Result { + cCount := (*C.int)(unsafe.Pointer(count)) + _ret := C.cuDeviceGetCount(cCount) + + return Result(_ret) +} + +// cuDriverGetVersion function as declared in cuda.h +func cuDriverGetVersion(version *int32) Result { + cVersion := (*C.int)(version) + _ret := C.cuDriverGetVersion(cVersion) + + return Result(_ret) +} + +// cuDeviceTotalMem function as declared in cuda.h +func cuDeviceTotalMem(bytes *uint64, dev Device) Result { + cBytes := (*C.size_t)(unsafe.Pointer(bytes)) + cDev := (C.CUdevice)(dev) + _ret := C.cuDeviceTotalMem(cBytes, cDev) + + return Result(_ret) +} + +// cuDeviceGetName function as declared in cuda.h +func cuDeviceGetName(name *byte, len int32, dev Device) Result { + cName := (*C.char)(unsafe.Pointer(name)) + cLen := (C.int)(len) + cDev := (C.CUdevice)(dev) + _ret := C.cuDeviceGetName(cName, cLen, cDev) + + return Result(_ret) +} diff --git a/pkg/nvidia-plugin/pkg/cuda/device.go b/pkg/nvidia-plugin/pkg/cuda/device.go new file mode 100644 index 000000000..ba8f71eb0 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/cuda/device.go @@ -0,0 +1,17 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package cuda diff --git a/pkg/nvidia-plugin/pkg/cuda/result.go b/pkg/nvidia-plugin/pkg/cuda/result.go new file mode 100644 index 000000000..ae2e1a9e0 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/cuda/result.go @@ -0,0 +1,178 @@ +/* +* SPDX-License-Identifier: Apache-2.0 +* +* The HAMi Contributors require contributions made to +* this file be licensed under the Apache-2.0 license or a +* compatible open source license. + */ + +/* +* Licensed to NVIDIA CORPORATION under one or more contributor +* license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright +* ownership. NVIDIA CORPORATION licenses this file to you under +* the Apache License, Version 2.0 (the "License"); you may +* not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +*     http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied.  See the License for the +* specific language governing permissions and limitations +* under the License. + */ + +/* +* Modifications Copyright The HAMi Authors. See +* GitHub history for details. 
+ */ + +package cuda + +import ( + "fmt" +) + +// String returns the string representation of a Result +func (r Result) String() string { + return errorStringFunc(r) +} + +// Error returns the string representation of a Result +func (r Result) Error() string { + return r.String() +} + +var errorStringFunc = defaultErrorStringFunc + +var defaultErrorStringFunc = func(r Result) string { + switch r { + case SUCCESS: + return "CUDA_SUCCESS" + case ERROR_INVALID_VALUE: + return "CUDA_ERROR_INVALID_VALUE" + case ERROR_OUT_OF_MEMORY: + return "CUDA_ERROR_OUT_OF_MEMORY" + case ERROR_NOT_INITIALIZED: + return "CUDA_ERROR_NOT_INITIALIZED" + case ERROR_DEINITIALIZED: + return "CUDA_ERROR_DEINITIALIZED" + case ERROR_PROFILER_DISABLED: + return "CUDA_ERROR_PROFILER_DISABLED" + case ERROR_PROFILER_NOT_INITIALIZED: + return "CUDA_ERROR_PROFILER_NOT_INITIALIZED" + case ERROR_PROFILER_ALREADY_STARTED: + return "CUDA_ERROR_PROFILER_ALREADY_STARTED" + case ERROR_PROFILER_ALREADY_STOPPED: + return "CUDA_ERROR_PROFILER_ALREADY_STOPPED" + case ERROR_NO_DEVICE: + return "CUDA_ERROR_NO_DEVICE" + case ERROR_INVALID_DEVICE: + return "CUDA_ERROR_INVALID_DEVICE" + case ERROR_INVALID_IMAGE: + return "CUDA_ERROR_INVALID_IMAGE" + case ERROR_INVALID_CONTEXT: + return "CUDA_ERROR_INVALID_CONTEXT" + case ERROR_CONTEXT_ALREADY_CURRENT: + return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT" + case ERROR_MAP_FAILED: + return "CUDA_ERROR_MAP_FAILED" + case ERROR_UNMAP_FAILED: + return "CUDA_ERROR_UNMAP_FAILED" + case ERROR_ARRAY_IS_MAPPED: + return "CUDA_ERROR_ARRAY_IS_MAPPED" + case ERROR_ALREADY_MAPPED: + return "CUDA_ERROR_ALREADY_MAPPED" + case ERROR_NO_BINARY_FOR_GPU: + return "CUDA_ERROR_NO_BINARY_FOR_GPU" + case ERROR_ALREADY_ACQUIRED: + return "CUDA_ERROR_ALREADY_ACQUIRED" + case ERROR_NOT_MAPPED: + return "CUDA_ERROR_NOT_MAPPED" + case ERROR_NOT_MAPPED_AS_ARRAY: + return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY" + case ERROR_NOT_MAPPED_AS_POINTER: + return "CUDA_ERROR_NOT_MAPPED_AS_POINTER" + case 
ERROR_ECC_UNCORRECTABLE: + return "CUDA_ERROR_ECC_UNCORRECTABLE" + case ERROR_UNSUPPORTED_LIMIT: + return "CUDA_ERROR_UNSUPPORTED_LIMIT" + case ERROR_CONTEXT_ALREADY_IN_USE: + return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE" + case ERROR_PEER_ACCESS_UNSUPPORTED: + return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED" + case ERROR_INVALID_PTX: + return "CUDA_ERROR_INVALID_PTX" + case ERROR_INVALID_GRAPHICS_CONTEXT: + return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT" + case ERROR_NVLINK_UNCORRECTABLE: + return "CUDA_ERROR_NVLINK_UNCORRECTABLE" + case ERROR_JIT_COMPILER_NOT_FOUND: + return "CUDA_ERROR_JIT_COMPILER_NOT_FOUND" + case ERROR_INVALID_SOURCE: + return "CUDA_ERROR_INVALID_SOURCE" + case ERROR_FILE_NOT_FOUND: + return "CUDA_ERROR_FILE_NOT_FOUND" + case ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: + return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND" + case ERROR_SHARED_OBJECT_INIT_FAILED: + return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED" + case ERROR_OPERATING_SYSTEM: + return "CUDA_ERROR_OPERATING_SYSTEM" + case ERROR_INVALID_HANDLE: + return "CUDA_ERROR_INVALID_HANDLE" + case ERROR_NOT_FOUND: + return "CUDA_ERROR_NOT_FOUND" + case ERROR_NOT_READY: + return "CUDA_ERROR_NOT_READY" + case ERROR_ILLEGAL_ADDRESS: + return "CUDA_ERROR_ILLEGAL_ADDRESS" + case ERROR_LAUNCH_OUT_OF_RESOURCES: + return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES" + case ERROR_LAUNCH_TIMEOUT: + return "CUDA_ERROR_LAUNCH_TIMEOUT" + case ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: + return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING" + case ERROR_PEER_ACCESS_ALREADY_ENABLED: + return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED" + case ERROR_PEER_ACCESS_NOT_ENABLED: + return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED" + case ERROR_PRIMARY_CONTEXT_ACTIVE: + return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE" + case ERROR_CONTEXT_IS_DESTROYED: + return "CUDA_ERROR_CONTEXT_IS_DESTROYED" + case ERROR_ASSERT: + return "CUDA_ERROR_ASSERT" + case ERROR_TOO_MANY_PEERS: + return "CUDA_ERROR_TOO_MANY_PEERS" + case ERROR_HOST_MEMORY_ALREADY_REGISTERED: + return 
"CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED" + case ERROR_HOST_MEMORY_NOT_REGISTERED: + return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED" + case ERROR_HARDWARE_STACK_ERROR: + return "CUDA_ERROR_HARDWARE_STACK_ERROR" + case ERROR_ILLEGAL_INSTRUCTION: + return "CUDA_ERROR_ILLEGAL_INSTRUCTION" + case ERROR_MISALIGNED_ADDRESS: + return "CUDA_ERROR_MISALIGNED_ADDRESS" + case ERROR_INVALID_ADDRESS_SPACE: + return "CUDA_ERROR_INVALID_ADDRESS_SPACE" + case ERROR_INVALID_PC: + return "CUDA_ERROR_INVALID_PC" + case ERROR_LAUNCH_FAILED: + return "CUDA_ERROR_LAUNCH_FAILED" + case ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: + return "CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE" + case ERROR_NOT_PERMITTED: + return "CUDA_ERROR_NOT_PERMITTED" + case ERROR_NOT_SUPPORTED: + return "CUDA_ERROR_NOT_SUPPORTED" + case ERROR_UNKNOWN: + return "CUDA_ERROR_UNKNOWN" + default: + return fmt.Sprintf("Unknown return value: %d", r) + } +} diff --git a/pkg/nvidia-plugin/pkg/dependencies/dependencies.go b/pkg/nvidia-plugin/pkg/dependencies/dependencies.go new file mode 100644 index 000000000..f793fea9f --- /dev/null +++ b/pkg/nvidia-plugin/pkg/dependencies/dependencies.go @@ -0,0 +1,7 @@ +//go:build dependencies +// +build dependencies + +// Package dependencies records dependencies. It cannot actually be compiled. +package dependencies + +import _ "github.com/NVIDIA/go-gpuallocator/gpuallocator" diff --git a/pkg/nvidia-plugin/pkg/flags/kubeclient.go b/pkg/nvidia-plugin/pkg/flags/kubeclient.go new file mode 100644 index 000000000..d0d105753 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/flags/kubeclient.go @@ -0,0 +1,114 @@ +/* + * Copyright 2023 The Kubernetes Authors. + * Copyright 2024 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package flags + +import ( + "fmt" + + "github.com/urfave/cli/v2" + + coreclientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned" +) + +type KubeClientConfig struct { + KubeConfig string + KubeAPIQPS float64 + KubeAPIBurst int +} + +type ClientSets struct { + Core coreclientset.Interface + NFD nfdclientset.Interface +} + +func (k *KubeClientConfig) Flags() []cli.Flag { + flags := []cli.Flag{ + &cli.StringFlag{ + Category: "Kubernetes client:", + Name: "kubeconfig", + Usage: "Absolute path to the `KUBECONFIG` file. 
Either this flag or the KUBECONFIG env variable need to be set if the driver is being run out of cluster.", + Destination: &k.KubeConfig, + EnvVars: []string{"KUBECONFIG"}, + }, + &cli.Float64Flag{ + Category: "Kubernetes client:", + Name: "kube-api-qps", + Usage: "`QPS` to use while communicating with the Kubernetes apiserver.", + Value: 5, + Destination: &k.KubeAPIQPS, + EnvVars: []string{"KUBE_API_QPS"}, + }, + &cli.IntFlag{ + Category: "Kubernetes client:", + Name: "kube-api-burst", + Usage: "`Burst` to use while communicating with the Kubernetes apiserver.", + Value: 10, + Destination: &k.KubeAPIBurst, + EnvVars: []string{"KUBE_API_BURST"}, + }, + } + + return flags +} + +func (k *KubeClientConfig) NewClientSetConfig() (*rest.Config, error) { + var csconfig *rest.Config + + var err error + if k.KubeConfig == "" { + csconfig, err = rest.InClusterConfig() + if err != nil { + return nil, fmt.Errorf("create in-cluster client configuration: %w", err) + } + } else { + csconfig, err = clientcmd.BuildConfigFromFlags("", k.KubeConfig) + if err != nil { + return nil, fmt.Errorf("create out-of-cluster client configuration: %w", err) + } + } + + csconfig.QPS = float32(k.KubeAPIQPS) + csconfig.Burst = k.KubeAPIBurst + + return csconfig, nil +} + +func (k *KubeClientConfig) NewClientSets() (ClientSets, error) { + csconfig, err := k.NewClientSetConfig() + if err != nil { + return ClientSets{}, fmt.Errorf("create client configuration: %w", err) + } + + coreclient, err := coreclientset.NewForConfig(csconfig) + if err != nil { + return ClientSets{}, fmt.Errorf("create core client: %w", err) + } + + nfdclient, err := nfdclientset.NewForConfig(csconfig) + if err != nil { + return ClientSets{}, fmt.Errorf("create nfd client: %w", err) + } + + return ClientSets{ + Core: coreclient, + NFD: nfdclient, + }, nil +} diff --git a/pkg/nvidia-plugin/pkg/flags/node.go b/pkg/nvidia-plugin/pkg/flags/node.go new file mode 100644 index 000000000..8a38c98a8 --- /dev/null +++ 
b/pkg/nvidia-plugin/pkg/flags/node.go @@ -0,0 +1,46 @@ +/* + * Copyright 2023 The Kubernetes Authors. + * Copyright 2024 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package flags + +import ( + "github.com/urfave/cli/v2" +) + +type NodeConfig struct { + Name string + Namespace string +} + +func (n *NodeConfig) Flags() []cli.Flag { + flags := []cli.Flag{ + &cli.StringFlag{ + Name: "namespace", + Usage: "The namespace used for the custom resources.", + Value: "default", + Destination: &n.Namespace, + EnvVars: []string{"NAMESPACE"}, + }, + &cli.StringFlag{ + Name: "node-name", + Usage: "The name of the node to be worked on.", + Destination: &n.Name, + EnvVars: []string{"NODE_NAME"}, + }, + } + return flags +} diff --git a/pkg/nvidia-plugin/pkg/imex/imex.go b/pkg/nvidia-plugin/pkg/imex/imex.go new file mode 100644 index 000000000..5b46d7baa --- /dev/null +++ b/pkg/nvidia-plugin/pkg/imex/imex.go @@ -0,0 +1,98 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package imex + +import ( + "errors" + "fmt" + "os" + "path/filepath" + + "k8s.io/klog/v2" + + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" +) + +// Channels represents a set of IMEX channels. +type Channels []*Channel + +// Channel represents an IMEX channel. +type Channel struct { + ID string + Path string + HostPath string +} + +// GetChannels returns the set of channels for the given config. +// If the selection of the default IMEX channel is disabled no channels are returned. +func GetChannels(config *spec.Config, devRoot string) (Channels, error) { + var channels Channels + for _, channelID := range config.Imex.ChannelIDs { + id := fmt.Sprintf("%d", channelID) + channelName := "channel" + id + path := filepath.Join("/dev/nvidia-caps-imex-channels", channelName) + channel := Channel{ + ID: id, + Path: path, + HostPath: filepath.Join(devRoot, path), + } + if exists, err := channel.exists(); !exists { + if config.Imex.Required { + return nil, errors.Join(err, fmt.Errorf("requested IMEX channel %v does not exist", channelName)) + } + klog.Warningf("Ignoring requested IMEX channel %v (%v)", channelName, err) + continue + } + klog.Infof("Selecting IMEX channel %v", channelName) + channels = append(channels, &channel) + } + return channels, nil +} + +// exists checks whether the IMEX channel exists. +// We check both the Path and HostPath since the location of the device node +// associated with the channel in the container is dependent on how it is +// injected. +// For example, if the host driver root is mounted at /driver-root the channel +// device node would be available at /driver-root/dev even if it was not +// injected into the container through any other mechanism. +// For the case of management containers using CDI to inject device nodes, these +// device nodes would exist at /dev in the container instead. 
// version is the version string reported by this package. It defaults to
// "unknown" and is intended to be overridden at build time via the linker's
// -X option.
// NOTE(review): this package is named info, so the -X target has to use this
// package's import path rather than main — confirm against the Makefile.
var version = "unknown"

// gitCommit is the hash of the commit the binary was built from. It is empty
// unless populated at build time.
var gitCommit = ""

// GetVersionParts returns the version components: the version string,
// followed by "commit: <hash>" when gitCommit is set.
func GetVersionParts() []string {
	if gitCommit == "" {
		return []string{version}
	}
	return []string{version, "commit: " + gitCommit}
}

// GetVersionString returns the version components followed by any extra
// entries in more, joined with newlines.
func GetVersionString(more ...string) string {
	return strings.Join(append(GetVersionParts(), more...), "\n")
}

// GetVersion returns the bare version string of the binary.
func GetVersion() string {
	return version
}
+**/ + +package lm + +import ( + "bufio" + "errors" + "fmt" + "io" + "net" + "os" + "path/filepath" + "sort" + "strings" + + "github.com/google/uuid" + "k8s.io/klog/v2" + + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource" +) + +const ( + // ImexNodesConfigFilePath is the path to the IMEX nodes config file. + // This file contains a list of IP addresses of the nodes in the IMEX domain. + ImexNodesConfigFilePath = "/etc/nvidia-imex/nodes_config.cfg" +) + +func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, error) { + var errs error + for _, root := range imexNodesConfigFilePathSearchRoots(config) { + configFilePath := filepath.Join(root, ImexNodesConfigFilePath) + imexLabeler, err := imexLabelerForConfigFile(configFilePath, devices) + if err != nil { + errs = errors.Join(errs, err) + continue + } + if imexLabeler != nil { + klog.Infof("Using labeler for IMEX config %v", configFilePath) + return imexLabeler, nil + } + } + if errs != nil { + return nil, errs + } + + return empty{}, nil +} + +// imexNodesConfigFilePathSearchRoots returns a list of roots to search for the IMEX nodes config file. +func imexNodesConfigFilePathSearchRoots(config *spec.Config) []string { + // By default, search / and /config for config files. + roots := []string{"/", "/config"} + + if config == nil || config.Flags.Plugin == nil || config.Flags.Plugin.ContainerDriverRoot == nil { + return roots + } + + // If a driver root is specified, it is also searched. 
+ return append(roots, *config.Flags.Plugin.ContainerDriverRoot) +} + +func imexLabelerForConfigFile(configFilePath string, devices []resource.Device) (Labeler, error) { + imexConfigFile, err := os.Open(configFilePath) + if os.IsNotExist(err) { + // No imex config file, return empty labels + return nil, nil + } else if err != nil { + return nil, fmt.Errorf("failed to open imex config file: %v", err) + } + defer imexConfigFile.Close() + + clusterUUID, cliqueID, err := getFabricIDs(devices) + if err != nil { + return nil, err + } + if clusterUUID == "" || cliqueID == "" { + return nil, nil + } + + imexDomainID, err := getImexDomainID(imexConfigFile) + if err != nil { + return nil, err + } + if imexDomainID == "" { + return nil, nil + } + + labels := Labels{ + "nvidia.com/gpu.clique": strings.Join([]string{clusterUUID, cliqueID}, "."), + "nvidia.com/gpu.imex-domain": strings.Join([]string{imexDomainID, cliqueID}, "."), + } + + return labels, nil +} + +func getFabricIDs(devices []resource.Device) (string, string, error) { + uniqueClusterUUIDs := make(map[string][]int) + uniqueCliqueIDs := make(map[string][]int) + for i, device := range devices { + isFabricAttached, err := device.IsFabricAttached() + if err != nil { + return "", "", fmt.Errorf("error checking imex capability: %v", err) + } + if !isFabricAttached { + continue + } + + clusterUUID, cliqueID, err := device.GetFabricIDs() + if err != nil { + + return "", "", fmt.Errorf("error getting fabric IDs: %w", err) + } + + uniqueClusterUUIDs[clusterUUID] = append(uniqueClusterUUIDs[clusterUUID], i) + uniqueCliqueIDs[cliqueID] = append(uniqueCliqueIDs[cliqueID], i) + } + + if len(uniqueClusterUUIDs) > 1 { + klog.Warningf("Cluster UUIDs are non-unique: %v", uniqueClusterUUIDs) + return "", "", nil + } + + if len(uniqueCliqueIDs) > 1 { + klog.Warningf("Clique IDs are non-unique: %v", uniqueCliqueIDs) + return "", "", nil + } + + for clusterUUID := range uniqueClusterUUIDs { + for cliqueID := range uniqueCliqueIDs { + 
return clusterUUID, cliqueID, nil + } + } + return "", "", nil +} + +// getImexDomainID reads the imex config file and returns a unique identifier +// based on the sorted list of IP addresses in the file. +func getImexDomainID(r io.Reader) (string, error) { + // Read the file line by line + var ips []string + scanner := bufio.NewScanner(r) + for scanner.Scan() { + ip := strings.TrimSpace(scanner.Text()) + if net.ParseIP(ip) == nil { + return "", fmt.Errorf("invalid IP address in imex config file: %s", ip) + } + ips = append(ips, ip) + } + + if err := scanner.Err(); err != nil { + return "", fmt.Errorf("failed to read imex config file: %v", err) + } + + if len(ips) == 0 { + // No IPs in the file, return empty labels + return "", nil + } + + sort.Strings(ips) + + return generateContentUUID(strings.Join(ips, "\n")), nil + +} + +func generateContentUUID(seed string) string { + return uuid.NewSHA1(uuid.Nil, []byte(seed)).String() +} diff --git a/pkg/nvidia-plugin/pkg/lm/imex_test.go b/pkg/nvidia-plugin/pkg/lm/imex_test.go new file mode 100644 index 000000000..553976465 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/lm/imex_test.go @@ -0,0 +1,57 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package lm + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestGerenerateDomainUUID(t *testing.T) { + testCases := []struct { + description string + ips []string + expected string + }{ + { + description: "single IP", + ips: []string{"10.130.3.24"}, + expected: "60ad7226-0130-54d0-b762-2a5385a3a26f", + }, + { + description: "multiple IPs", + ips: []string{ + "10.130.3.24", + "10.130.3.53", + "10.130.3.23", + "10.130.3.31", + "10.130.3.27", + "10.130.3.25", + }, + expected: "8a7363e9-1003-5814-9354-175fdff19204", + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + id := generateContentUUID(strings.Join(tc.ips, "\n")) + require.Equal(t, tc.expected, id) + }) + } +} diff --git a/pkg/nvidia-plugin/pkg/lm/labeler.go b/pkg/nvidia-plugin/pkg/lm/labeler.go new file mode 100644 index 000000000..d1f41341a --- /dev/null +++ b/pkg/nvidia-plugin/pkg/lm/labeler.go @@ -0,0 +1,45 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/
+
+package lm
+
+import (
+	"fmt"
+
+	spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1"
+	"github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource"
+	"github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/vgpu"
+)
+
+// Labeler defines an interface for generating labels. Labels returns the generated label set or an error.
+type Labeler interface {
+	Labels() (Labels, error)
+}
+
+// NewLabelers constructs the required labelers from the specified config: the device labeler merged with the vGPU labeler.
+func NewLabelers(manager resource.Manager, vgpu vgpu.Interface, config *spec.Config) (Labeler, error) {
+	deviceLabeler, err := NewDeviceLabeler(manager, config)
+	if err != nil {
+		return nil, fmt.Errorf("error creating labeler: %w", err) // %w keeps the cause reachable through errors.Is / errors.As
+	}
+
+	l := Merge(
+		deviceLabeler,
+		NewVGPULabeler(vgpu), // vgpu here is the parameter, which shadows the imported vgpu package inside this function
+	)
+
+	return l, nil
+}
diff --git a/pkg/nvidia-plugin/pkg/lm/labels.go b/pkg/nvidia-plugin/pkg/lm/labels.go
new file mode 100644
index 000000000..8283b6a3a
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/lm/labels.go
@@ -0,0 +1,25 @@
+/**
+# Copyright (c) NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package lm
+
+// Labels defines a type for labels: a map from label name to label value.
+type Labels map[string]string
+
+// Labels also implements the Labeler interface by returning the receiver itself and a nil error.
+func (labels Labels) Labels() (Labels, error) {
+	return labels, nil
+}
diff --git a/pkg/nvidia-plugin/pkg/lm/list.go b/pkg/nvidia-plugin/pkg/lm/list.go
new file mode 100644
index 000000000..decf0ee98
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/lm/list.go
@@ -0,0 +1,46 @@
+/**
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package lm
+
+import "fmt"
+
+// list represents a list of labelers that itself implements the Labeler interface.
+type list []Labeler
+
+// Merge converts a set of labelers to a single composite labeler; the argument order is kept.
+func Merge(labelers ...Labeler) Labeler {
+	l := list(labelers) // plain conversion of the variadic slice; no copy is made
+
+	return l
+}
+
+// Labels returns the labels from a set of labelers. Labels later in the list
+// overwrite earlier labels.
+func (labelers list) Labels() (Labels, error) {
+	allLabels := make(Labels) // always non-nil, so an empty list yields an empty map
+	for _, labeler := range labelers {
+		labels, err := labeler.Labels()
+		if err != nil {
+			return nil, fmt.Errorf("error generating labels: %v", err) // the first failing labeler aborts the merge
+		}
+		for k, v := range labels {
+			allLabels[k] = v // a key already set by an earlier labeler is overwritten
+		}
+	}
+
+	return allLabels, nil
+}
diff --git a/pkg/nvidia-plugin/pkg/lm/machine-type.go b/pkg/nvidia-plugin/pkg/lm/machine-type.go
new file mode 100644
index 000000000..887c180b6
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/lm/machine-type.go
@@ -0,0 +1,53 @@
+/**
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package lm
+
+import (
+	"fmt"
+	"os"
+	"strings"
+
+	"k8s.io/klog/v2"
+)
+
+const (
+	machineTypeUnknown = "unknown" // fallback used when no path is given or the file cannot be read
+)
+
+func newMachineTypeLabeler(machineTypePath string) (Labeler, error) { // builds the "nvidia.com/gpu.machine" label; the returned error is always nil
+	machineType, err := getMachineType(machineTypePath)
+	if err != nil {
+		klog.Warningf("Error getting machine type from %v: %v", machineTypePath, err)
+		machineType = machineTypeUnknown // a read failure is logged and degraded to "unknown" rather than returned
+	}
+	l := Labels{
+		"nvidia.com/gpu.machine": sanitise(machineType),
+	}
+
+	return l, nil
+}
+
+func getMachineType(path string) (string, error) { // returns the whitespace-trimmed contents of the file at path
+	if path == "" {
+		return machineTypeUnknown, nil // no file configured: report "unknown" without an error
+	}
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return "", fmt.Errorf("could not open machine type file: %v", err)
+	}
+	return strings.TrimSpace(string(data)), nil
+}
diff --git a/pkg/nvidia-plugin/pkg/lm/mig-strategy.go b/pkg/nvidia-plugin/pkg/lm/mig-strategy.go
new file mode 100644
index 000000000..f77d40ea9
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/lm/mig-strategy.go
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package lm
+
+import (
+	"fmt"
+
+	"k8s.io/klog/v2"
+
+	spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1"
+	"github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/mig"
+	"github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource"
+)
+
+// Constants representing different MIG strategies.
+const (
+	MigStrategyNone   = "none"
+	MigStrategySingle = "single"
+	MigStrategyMixed  = "mixed"
+)
+
+// migResource is used to track MIG devices for labelling under the single and mixed strategies.
+// This allows a particular resource name to be associated with a resource.Device and count.
+type migResource struct {
+	name   spec.ResourceName
+	device resource.Device
+	count  int
+}
+
+// NewResourceLabeler creates a labeler for available GPU resources.
+// These include full GPU labels as well as labels specific to the mig-strategy specified.
+func NewResourceLabeler(manager resource.Manager, config *spec.Config) (Labeler, error) {
+	devices, err := manager.GetDevices()
+	if err != nil {
+		return nil, fmt.Errorf("error getting devices: %v", err)
+	}
+
+	// If no GPUs are detected, we return an empty labeler
+	if len(devices) == 0 {
+		return empty{}, nil
+	}
+
+	fullGPULabeler, err := newGPULabelers(manager, config)
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct GPU labeler: %v", err)
+	}
+
+	if *config.Flags.MigStrategy == spec.MigStrategyNone { // the MigStrategy pointer is dereferenced without a nil check
+		return fullGPULabeler, nil
+	}
+
+	migLabeler, err := newMigLabeler(manager, config)
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct MIG resource labeler: %v", err)
+	}
+
+	labelers := Merge(
+		fullGPULabeler,
+		migLabeler, // merged last, so MIG labels overwrite full-GPU labels that share a key
+	)
+
+	return labelers, nil
+
+}
+
+// MigDeviceCounts maintains a count of unique MIG device types across all GPUs on a node
+type MigDeviceCounts map[string]int
+
+// newMigLabeler creates a labeler for MIG devices.
+// The labeler created depends on the migStrategy.
+func newMigLabeler(manager resource.Manager, config *spec.Config) (Labeler, error) {
+	var err error
+	var labeler Labeler
+	switch *config.Flags.MigStrategy {
+	case MigStrategyNone:
+		labeler = empty{}
+	case MigStrategySingle:
+		labeler, err = newMigStrategySingleLabeler(manager, config)
+		if err != nil {
+			return nil, fmt.Errorf("failed to create labeler for mig-strategy=single: %v", err)
+		}
+	case MigStrategyMixed:
+		labeler, err = newMigStrategyMixedLabeler(manager, config)
+		if err != nil {
+			return nil, fmt.Errorf("failed to create labeler for mig-strategy=mixed: %v", err)
+		}
+	default:
+		return nil, fmt.Errorf("unknown strategy: %v", *config.Flags.MigStrategy)
+	}
+
+	labelers := Merge(
+		migStrategyLabeler(*config.Flags.MigStrategy),
+		labeler, // merged last, so the strategy-specific labels overwrite on key clashes
+	)
+
+	return labelers, nil
+}
+
+// newGPULabelers creates a set of labelers for full GPUs and evaluates them immediately, returning the merged Labels.
+func newGPULabelers(manager resource.Manager, config *spec.Config) (Labeler, error) {
+	deviceInfo := mig.NewDeviceInfo(manager)
+
+	devicesByMigEnabled, err := deviceInfo.GetDevicesMap() // indexed below with true (MIG enabled) and false (MIG disabled)
+	if err != nil {
+		return nil, fmt.Errorf("error getting map of devices: %v", err)
+	}
+
+	if len(devicesByMigEnabled) == 0 {
+		return nil, fmt.Errorf("no GPU devices detected")
+	}
+
+	counts := make(map[string]int) // devices per device name, MIG-enabled and full GPUs counted together
+	migEnabledDevices := make(map[string]resource.Device)
+	for _, device := range devicesByMigEnabled[true] {
+		name, err := device.GetName()
+		if err != nil {
+			return nil, fmt.Errorf("error getting device name: %v", err)
+		}
+		migEnabledDevices[name] = device // one representative device per name; later devices replace earlier ones
+		counts[name]++
+	}
+
+	fullGPUs := make(map[string]resource.Device)
+	for _, device := range devicesByMigEnabled[false] {
+		name, err := device.GetName()
+		if err != nil {
+			return nil, fmt.Errorf("error getting device name: %v", err)
+		}
+		fullGPUs[name] = device
+		counts[name]++
+	}
+
+	if len(counts) > 1 {
+		var names []string
+		for n := range counts {
+			names = append(names, n)
+		}
+		klog.Warningf("Multiple device types detected: %v", names)
+	}
+
+	var labelers list
+	// We construct labelers for the MIG-enabled resources.
+	// These do not include sharing information.
+	for name, migEnabledDevice := range migEnabledDevices { // NOTE(review): map iteration order is random; with several device names the order of these labelers presumably varies between runs
+		// We generate a resource label with no sharing modifications
+		l, err := NewGPUResourceLabelerWithoutSharing(migEnabledDevice, counts[name])
+		if err != nil {
+			return nil, fmt.Errorf("failed to construct labeler: %v", err)
+		}
+
+		labelers = append(labelers, l)
+	}
+
+	// We construct labelers for the full GPUs.
+	// These override any resources with the same name that have MIG enabled.
+	for name, fullGPU := range fullGPUs {
+		l, err := NewGPUResourceLabeler(config, fullGPU, counts[name])
+		if err != nil {
+			return nil, fmt.Errorf("failed to construct labeler: %v", err)
+		}
+
+		labelers = append(labelers, l)
+	}
+
+	return labelers.Labels() // evaluated here: the result is a Labels value, which itself implements Labeler
+}
+
+func newMigStrategySingleLabeler(manager resource.Manager, config *spec.Config) (Labeler, error) {
+	deviceInfo := mig.NewDeviceInfo(manager)
+	migEnabledDevices, err := deviceInfo.GetDevicesWithMigEnabled()
+	if err != nil {
+		return nil, fmt.Errorf("unabled to retrieve list of MIG-enabled devices: %v", err)
+	}
+	// No devices have migEnabled=true. This is equivalent to the `none` MIG strategy
+	if len(migEnabledDevices) == 0 {
+		return empty{}, nil
+	}
+
+	hasEmpty, err := deviceInfo.AnyMigEnabledDeviceIsEmpty()
+	if err != nil {
+		return nil, fmt.Errorf("failed to check for empty MIG-enabled devices: %v", err)
+	}
+	// If any migEnabled=true device is empty, we return the set of mig-strategy-invalid labels.
+	if hasEmpty {
+		return newInvalidMigStrategyLabeler(migEnabledDevices[0], "at least one MIG device is enabled but empty")
+	}
+
+	migDisabledDevices, err := deviceInfo.GetDevicesWithMigDisabled()
+	if err != nil {
+		return nil, fmt.Errorf("unabled to retrieve list of non-MIG-enabled devices: %v", err)
+	}
+	// If we have a mix of mig-enabled and mig-disabled devices we return the set of mig-strategy-invalid labels
+	if len(migDisabledDevices) != 0 {
+		return newInvalidMigStrategyLabeler(migEnabledDevices[0], "devices with MIG enabled and disable detected")
+	}
+
+	migs, err := deviceInfo.GetAllMigDevices()
+	if err != nil {
+		return nil, fmt.Errorf("unable to retrieve list of MIG devices: %v", err)
+	}
+
+	// Add new MIG related labels on each individual MIG type
+	resources := make(map[string]migResource)
+	for _, mig := range migs { // the loop variable shadows the imported mig package inside this loop
+		name, err := mig.GetName()
+		if err != nil {
+			return nil, fmt.Errorf("unable to get MIG device name: %v", err)
+		}
+
+		resource, exists := resources[name]
+		// For the first occurrence we update the device reference and the resource name
+		if !exists {
+			resource.device = mig
+			resource.name = fullGPUResourceName
+		}
+		// We increase the count
+		resource.count++
+
+		resources[name] = resource
+	}
+
+	// Multiple resources mean that we have more than one MIG profile defined. Return the set of mig-strategy-invalid labels.
+	if len(resources) != 1 {
+		return newInvalidMigStrategyLabeler(migEnabledDevices[0], "more than one MIG device type present on node")
+	}
+
+	return newMIGDeviceLabelers(resources, config)
+}
+
+func newInvalidMigStrategyLabeler(device resource.Device, reason string) (Labeler, error) {
+	klog.Warningf("Invalid configuration detected for mig-strategy=single: %v", reason)
+
+	model, err := device.GetName()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get device model: %v", err)
+	}
+
+	rl := resourceLabeler{
+		resourceName: "nvidia.com/gpu",
+	}
+
+	labels := rl.productLabel(model, "MIG", "INVALID")
+
+	rl.updateLabel(labels, "count", 0)
+	rl.updateLabel(labels, "replicas", 0)
+	rl.updateLabel(labels, "sharing-strategy", "")
+	rl.updateLabel(labels, "memory", 0)
+
+	return labels, nil
+}
+
+func newMigStrategyMixedLabeler(manager resource.Manager, config *spec.Config) (Labeler, error) {
+	deviceInfo := mig.NewDeviceInfo(manager)
+
+	// Enumerate the MIG devices on this node. In mig.strategy=mixed we ignore devices
+	// configured with migEnabled=true but exposing no MIG devices.
+	migs, err := deviceInfo.GetAllMigDevices()
+	if err != nil {
+		return nil, fmt.Errorf("unable to retrieve list of MIG devices: %v", err)
+	}
+
+	// Add new MIG related labels on each individual MIG type
+	resources := make(map[string]migResource)
+	for _, mig := range migs {
+		name, err := mig.GetName()
+		if err != nil {
+			return nil, fmt.Errorf("unable to get MIG device name: %v", err)
+		}
+
+		resource, exists := resources[name]
+		// For the first occurrence we update the device reference and the resource name
+		if !exists {
+			resource.device = mig
+			resource.name = spec.ResourceName("nvidia.com/mig-" + name)
+		}
+		// We increase the count
+		resource.count++
+
+		resources[name] = resource
+	}
+
+	return newMIGDeviceLabelers(resources, config)
+}
+
+func newMIGDeviceLabelers(resources map[string]migResource, config *spec.Config) (Labeler, error) {
+	var labelers list // stays nil when resources is empty; a nil list is still returned as a Labeler
+	for _, resource := range resources {
+		l, err := NewMIGResourceLabeler(resource.name, config, resource.device, resource.count)
+		if err != nil {
+			return nil, fmt.Errorf("failed to construct labeler: %v", err)
+		}
+
+		labelers = append(labelers, l)
+	}
+
+	return labelers, nil
+}
diff --git a/pkg/nvidia-plugin/pkg/lm/mig-strategy_test.go b/pkg/nvidia-plugin/pkg/lm/mig-strategy_test.go
new file mode 100644
index 000000000..0897a0b0b
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/lm/mig-strategy_test.go
@@ -0,0 +1,422 @@
+/**
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package lm
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1"
+	"github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource"
+	rt "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource/testing"
+)
+
+func TestMigStrategyNoneLabels(t *testing.T) { // checks the labels produced by NewResourceLabeler with mig-strategy=none
+	testCases := []struct {
+		description    string
+		devices        []resource.Device
+		timeSlicing    spec.ReplicatedResources
+		expectedError  bool
+		expectedLabels Labels
+	}{
+		{
+			description: "no devices returns empty labels",
+		},
+		{
+			description: "single non-mig device returns non-mig (none) labels",
+			devices: []resource.Device{
+				rt.NewFullGPU(),
+			},
+			expectedLabels: Labels{
+				"nvidia.com/gpu.compute.major":    "8",
+				"nvidia.com/gpu.compute.minor":    "0",
+				"nvidia.com/gpu.family":           "ampere",
+				"nvidia.com/gpu.count":            "1",
+				"nvidia.com/gpu.replicas":         "1",
+				"nvidia.com/gpu.sharing-strategy": "none",
+				"nvidia.com/gpu.memory":           "300",
+				"nvidia.com/gpu.product":          "MOCKMODEL",
+			},
+		},
+		{
+			description: "sharing is applied to single device",
+			devices: []resource.Device{
+				rt.NewFullGPU(),
+			},
+			timeSlicing: spec.ReplicatedResources{
+				Resources: []spec.ReplicatedResource{
+					{
+						Name:     "nvidia.com/gpu",
+						Replicas: 2,
+					},
+				},
+			},
+			expectedLabels: Labels{
+				"nvidia.com/gpu.compute.major":    "8",
+				"nvidia.com/gpu.compute.minor":    "0",
+				"nvidia.com/gpu.family":           "ampere",
+				"nvidia.com/gpu.count":            "1",
+				"nvidia.com/gpu.replicas":         "2",
+				"nvidia.com/gpu.sharing-strategy": "time-slicing",
+				"nvidia.com/gpu.memory":           "300",
+				"nvidia.com/gpu.product":          "MOCKMODEL-SHARED",
+			},
+		},
+		{
+			description: "sharing is applied to multiple devices",
+			devices: []resource.Device{
+				rt.NewFullGPU(),
+				rt.NewFullGPU(),
+			},
+			timeSlicing: spec.ReplicatedResources{
+				Resources: []spec.ReplicatedResource{
+					{
+						Name:     "nvidia.com/gpu",
+						Replicas: 2,
+					},
+				},
+			},
+			expectedLabels: Labels{
+				"nvidia.com/gpu.compute.major":    "8",
+				"nvidia.com/gpu.compute.minor":    "0",
+				"nvidia.com/gpu.family":           "ampere",
+				"nvidia.com/gpu.count":            "2",
+				"nvidia.com/gpu.replicas":         "2",
+				"nvidia.com/gpu.sharing-strategy": "time-slicing",
+				"nvidia.com/gpu.memory":           "300",
+				"nvidia.com/gpu.product":          "MOCKMODEL-SHARED",
+			},
+		},
+		{
+			description: "sharing is not applied to single MIG device; replicas is zero",
+			devices: []resource.Device{
+				rt.NewMigEnabledDevice(),
+			},
+			timeSlicing: spec.ReplicatedResources{
+				Resources: []spec.ReplicatedResource{
+					{
+						Name:     "nvidia.com/gpu",
+						Replicas: 2,
+					},
+				},
+			},
+			expectedLabels: Labels{
+				"nvidia.com/gpu.count":            "1",
+				"nvidia.com/gpu.replicas":         "0",
+				"nvidia.com/gpu.sharing-strategy": "none",
+				"nvidia.com/gpu.memory":           "300",
+				"nvidia.com/gpu.product":          "MOCKMODEL",
+			},
+		},
+		{
+			description: "sharing is not applied to multiple MIG device; replicas is zero",
+			devices: []resource.Device{
+				rt.NewMigEnabledDevice(),
+				rt.NewMigEnabledDevice(),
+			},
+			timeSlicing: spec.ReplicatedResources{
+				Resources: []spec.ReplicatedResource{
+					{
+						Name:     "nvidia.com/gpu",
+						Replicas: 2,
+					},
+				},
+			},
+			expectedLabels: Labels{
+				"nvidia.com/gpu.count":            "2",
+				"nvidia.com/gpu.replicas":         "0",
+				"nvidia.com/gpu.sharing-strategy": "none",
+				"nvidia.com/gpu.memory":           "300",
+				"nvidia.com/gpu.product":          "MOCKMODEL",
+			},
+		},
+		{
+			description: "sharing is applied to MIG device and non-MIG device",
+			devices: []resource.Device{
+				rt.NewMigEnabledDevice(),
+				rt.NewFullGPU(),
+			},
+			timeSlicing: spec.ReplicatedResources{
+				Resources: []spec.ReplicatedResource{
+					{
+						Name:     "nvidia.com/gpu",
+						Replicas: 2,
+					},
+				},
+			},
+			expectedLabels: Labels{
+				"nvidia.com/gpu.compute.major":    "8",
+				"nvidia.com/gpu.compute.minor":    "0",
+				"nvidia.com/gpu.family":           "ampere",
+				"nvidia.com/gpu.count":            "2",
+				"nvidia.com/gpu.replicas":         "2",
+				"nvidia.com/gpu.sharing-strategy": "time-slicing",
+				"nvidia.com/gpu.memory":           "300",
+				"nvidia.com/gpu.product":          "MOCKMODEL-SHARED",
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			nvmlMock := rt.NewManagerMockWithDevices(tc.devices...)
+
+			config := spec.Config{
+				Flags: spec.Flags{
+					CommandLineFlags: spec.CommandLineFlags{
+						MigStrategy: ptr(MigStrategyNone),
+					},
+				},
+				Sharing: spec.Sharing{
+					TimeSlicing: tc.timeSlicing,
+				},
+			}
+
+			none, err := NewResourceLabeler(nvmlMock, &config)
+			require.NoError(t, err) // fail here instead of panicking on a nil labeler below
+			labels, err := none.Labels()
+			if tc.expectedError {
+				require.Error(t, err)
+			} else {
+				require.NoError(t, err)
+			}
+
+			require.EqualValues(t, tc.expectedLabels, labels)
+		})
+	}
+}
+
+func TestMigStrategySingleLabels(t *testing.T) { // checks the labels produced by NewResourceLabeler with mig-strategy=single
+	testCases := []struct {
+		description    string
+		devices        []resource.Device
+		expectedError  bool
+		expectedLabels Labels
+		isInvalid      bool
+	}{
+		{
+			description: "no devices returns empty labels",
+		},
+		{
+			description: "single non-mig device returns non-mig (none) labels",
+			devices: []resource.Device{
+				rt.NewFullGPU(),
+			},
+			expectedLabels: Labels{
+				"nvidia.com/gpu.compute.major":    "8",
+				"nvidia.com/gpu.compute.minor":    "0",
+				"nvidia.com/gpu.family":           "ampere",
+				"nvidia.com/gpu.count":            "1",
+				"nvidia.com/gpu.replicas":         "1",
+				"nvidia.com/gpu.sharing-strategy": "none",
+				"nvidia.com/gpu.memory":           "300",
+				"nvidia.com/gpu.product":          "MOCKMODEL",
+				"nvidia.com/mig.strategy":         "single",
+			},
+		},
+		{
+			description: "multiple non-mig device returns non-mig (none) labels",
+			devices: []resource.Device{
+				rt.NewFullGPU(),
+				rt.NewFullGPU(),
+			},
+			expectedLabels: Labels{
+				"nvidia.com/gpu.compute.major":    "8",
+				"nvidia.com/gpu.compute.minor":    "0",
+				"nvidia.com/gpu.family":           "ampere",
+				"nvidia.com/gpu.count":            "2",
+				"nvidia.com/gpu.replicas":         "1",
+				"nvidia.com/gpu.sharing-strategy": "none",
+				"nvidia.com/gpu.memory":           "300",
+				"nvidia.com/gpu.product":          "MOCKMODEL",
+				"nvidia.com/mig.strategy":         "single",
+			},
+		},
+		{
+			description: "single mig-enabled device returns mig labels",
+			devices: []resource.Device{
+				rt.NewMigEnabledDevice(
+					rt.NewMigDevice(1, 2, 100),
+				),
+			},
+			expectedLabels: Labels{
+				"nvidia.com/gpu.count":            "1",
+				"nvidia.com/gpu.replicas":         "1",
+				"nvidia.com/gpu.sharing-strategy": "none",
+				"nvidia.com/gpu.memory":           "100",
+				"nvidia.com/gpu.product":          "MOCKMODEL-MIG-1g.100gb",
+				"nvidia.com/mig.strategy":         "single",
+				"nvidia.com/gpu.multiprocessors":  "0",
+				"nvidia.com/gpu.slices.gi":        "1",
+				"nvidia.com/gpu.slices.ci":        "2",
+				"nvidia.com/gpu.engines.copy":     "0",
+				"nvidia.com/gpu.engines.decoder":  "0",
+				"nvidia.com/gpu.engines.encoder":  "0",
+				"nvidia.com/gpu.engines.jpeg":     "0",
+				"nvidia.com/gpu.engines.ofa":      "0",
+			},
+		},
+		{
+			description: "multiple mig-enabled devices returns mig labels",
+			devices: []resource.Device{
+				rt.NewMigEnabledDevice(
+					rt.NewMigDevice(1, 2, 100, map[string]interface{}{
+						"multiprocessors": 12,
+						"engines.copy":    13,
+						"engines.decoder": 14,
+						"engines.encoder": 15,
+						"engines.jpeg":    16,
+						"engines.ofa":     17,
+					}),
+				),
+				rt.NewMigEnabledDevice(
+					rt.NewMigDevice(1, 2, 100, map[string]interface{}{
+						"multiprocessors": 12,
+						"engines.copy":    13,
+						"engines.decoder": 14,
+						"engines.encoder": 15,
+						"engines.jpeg":    16,
+						"engines.ofa":     17,
+					}),
+				),
+			},
+			expectedLabels: Labels{
+				"nvidia.com/gpu.count":            "2",
+				"nvidia.com/gpu.replicas":         "1",
+				"nvidia.com/gpu.sharing-strategy": "none",
+				"nvidia.com/gpu.memory":           "100",
+				"nvidia.com/gpu.product":          "MOCKMODEL-MIG-1g.100gb",
+				"nvidia.com/mig.strategy":         "single",
+				"nvidia.com/gpu.multiprocessors":  "12",
+				"nvidia.com/gpu.slices.gi":        "1",
+				"nvidia.com/gpu.slices.ci":        "2",
+				"nvidia.com/gpu.engines.copy":     "13",
+				"nvidia.com/gpu.engines.decoder":  "14",
+				"nvidia.com/gpu.engines.encoder":  "15",
+				"nvidia.com/gpu.engines.jpeg":     "16",
+				"nvidia.com/gpu.engines.ofa":      "17",
+			},
+		},
+		{
+			description: "empty mig devices returns MIG invalid label",
+			devices: []resource.Device{
+				rt.NewMigEnabledDevice(),
+			},
+			isInvalid: true,
+			expectedLabels: Labels{
+				"nvidia.com/gpu.count":            "0",
+				"nvidia.com/gpu.replicas":         "0",
+				"nvidia.com/gpu.sharing-strategy": "",
+				"nvidia.com/gpu.memory":           "0",
+				"nvidia.com/gpu.product":          "MOCKMODEL-MIG-INVALID",
+				"nvidia.com/mig.strategy":         "single",
+			},
+		},
+		{
+			description: "mixed mig config returns MIG invalid label",
+			devices: []resource.Device{
+				rt.NewMigEnabledDevice(
+					rt.NewMigDevice(1, 2, 100),
+					rt.NewMigDevice(3, 4, 100),
+				),
+			},
+			isInvalid: true,
+			expectedLabels: Labels{
+				"nvidia.com/gpu.count":            "0",
+				"nvidia.com/gpu.replicas":         "0",
+				"nvidia.com/gpu.sharing-strategy": "",
+				"nvidia.com/gpu.memory":           "0",
+				"nvidia.com/gpu.product":          "MOCKMODEL-MIG-INVALID",
+				"nvidia.com/mig.strategy":         "single",
+			},
+		},
+		{
+			description: "mixed mig enabled and disabled returns invalid config",
+			devices: []resource.Device{
+				rt.NewMigEnabledDevice(
+					rt.NewMigDevice(1, 2, 100),
+				),
+				rt.NewFullGPU(),
+			},
+			isInvalid: true,
+			expectedLabels: Labels{
+				"nvidia.com/gpu.compute.major":    "8",
+				"nvidia.com/gpu.compute.minor":    "0",
+				"nvidia.com/gpu.family":           "ampere",
+				"nvidia.com/gpu.count":            "0",
+				"nvidia.com/gpu.replicas":         "0",
+				"nvidia.com/gpu.sharing-strategy": "",
+				"nvidia.com/gpu.memory":           "0",
+				"nvidia.com/gpu.product":          "MOCKMODEL-MIG-INVALID",
+				"nvidia.com/mig.strategy":         "single",
+			},
+		},
+		{
+			description: "enabled, disabled, and empty returns invalid config",
+			devices: []resource.Device{
+				rt.NewMigEnabledDevice(
+					rt.NewMigDevice(1, 2, 100),
+				),
+				rt.NewFullGPU(),
+				rt.NewMigEnabledDevice(),
+			},
+			isInvalid: true,
+			expectedLabels: Labels{
+				"nvidia.com/gpu.compute.major":    "8",
+				"nvidia.com/gpu.compute.minor":    "0",
+				"nvidia.com/gpu.family":           "ampere",
+				"nvidia.com/gpu.count":            "0",
+				"nvidia.com/gpu.replicas":         "0",
+				"nvidia.com/gpu.sharing-strategy": "",
+				"nvidia.com/gpu.memory":           "0",
+				"nvidia.com/gpu.product":          "MOCKMODEL-MIG-INVALID",
+				"nvidia.com/mig.strategy":         "single",
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			nvmlMock := rt.NewManagerMockWithDevices(tc.devices...)
+
+			config := spec.Config{
+				Flags: spec.Flags{
+					CommandLineFlags: spec.CommandLineFlags{
+						MigStrategy: ptr(MigStrategySingle),
+					},
+				},
+			}
+
+			single, err := NewResourceLabeler(nvmlMock, &config)
+			require.NoError(t, err) // fail here instead of panicking on a nil labeler below
+			labels, err := single.Labels()
+			if tc.expectedError {
+				require.Error(t, err)
+			} else {
+				require.NoError(t, err)
+			}
+
+			require.EqualValues(t, tc.expectedLabels, labels)
+		})
+	}
+}
+
+// ptr returns a pointer to a copy of the value passed into it
+func ptr[T any](x T) *T {
+	return &x
+}
diff --git a/pkg/nvidia-plugin/pkg/lm/nvml.go b/pkg/nvidia-plugin/pkg/lm/nvml.go
new file mode 100644
index 000000000..77bf7a817
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/lm/nvml.go
@@ -0,0 +1,262 @@
+/**
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package lm
+
+import (
+	"errors"
+	"fmt"
+	"strconv"
+	"strings"
+
+	"k8s.io/klog/v2"
+
+	"github.com/NVIDIA/go-nvlib/pkg/nvpci"
+
+	spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1"
+	"github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource"
+)
+
+var errMPSSharingNotSupported = errors.New("MPS sharing is not supported")
+
+// NewDeviceLabeler creates a new labeler for the specified resource manager.
+func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, error) {
+	if err := manager.Init(); err != nil {
+		return nil, fmt.Errorf("failed to initialize resource manager: %w", err)
+	}
+	defer func() {
+		_ = manager.Shutdown() // the shutdown error is deliberately discarded
+	}()
+
+	devices, err := manager.GetDevices()
+	if err != nil {
+		return nil, fmt.Errorf("error getting devices: %w", err)
+	}
+
+	if len(devices) == 0 {
+		return empty{}, nil // no devices: no labels, not even the machine-type or version ones
+	}
+
+	machineTypeLabeler, err := newMachineTypeLabeler(*config.Flags.GFD.MachineTypeFile) // the MachineTypeFile pointer is dereferenced without a nil check
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct machine type labeler: %w", err)
+	}
+
+	versionLabeler, err := newVersionLabeler(manager)
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct version labeler: %w", err)
+	}
+
+	migCapabilityLabeler, err := newMigCapabilityLabeler(manager)
+	if err != nil {
+		return nil, fmt.Errorf("error creating mig capability labeler: %w", err)
+	}
+
+	sharingLabeler, err := newSharingLabeler(manager, config)
+	if err != nil {
+		return nil, fmt.Errorf("error creating sharing labeler: %w", err)
+	}
+
+	resourceLabeler, err := NewResourceLabeler(manager, config)
+	if err != nil {
+		return nil, fmt.Errorf("error creating resource labeler: %w", err)
+	}
+
+	gpuModeLabeler, err := newGPUModeLabeler(devices)
+	if err != nil {
+		return nil, fmt.Errorf("error creating GPU mode labeler: %w", err) // previously reported as a "resource labeler" failure
+	}
+
+	imexLabeler, err := newImexLabeler(config, devices)
+	if err != nil {
+		return nil, fmt.Errorf("error creating IMEX labeler: %w", err)
+	}
+
+	l := Merge( // labelers later in this list overwrite labels set by earlier ones
+		machineTypeLabeler,
+		versionLabeler,
+		migCapabilityLabeler,
+		sharingLabeler,
+		resourceLabeler,
+		gpuModeLabeler,
+		imexLabeler,
+	)
+
+	return l, nil
+}
+
+// newVersionLabeler creates a labeler that generates the CUDA and driver version labels.
+func newVersionLabeler(manager resource.Manager) (Labeler, error) {
+	driverVersion, err := manager.GetDriverVersion()
+	if err != nil {
+		return nil, fmt.Errorf("error getting driver version: %v", err)
+	}
+
+	driverVersionSplit := strings.Split(driverVersion, ".")
+	if len(driverVersionSplit) > 3 || len(driverVersionSplit) < 2 { // only two or three dot-separated parts are accepted
+		return nil, fmt.Errorf("error getting driver version: Version \"%s\" does not match format \"X.Y[.Z]\"", driverVersion)
+	}
+
+	driverMajor := driverVersionSplit[0]
+	driverMinor := driverVersionSplit[1]
+	driverRev := "" // stays empty for two-part versions
+	if len(driverVersionSplit) > 2 {
+		driverRev = driverVersionSplit[2]
+	}
+
+	cudaMajor, cudaMinor, err := manager.GetCudaDriverVersion()
+	if err != nil {
+		return nil, fmt.Errorf("error getting cuda driver version: %v", err)
+	}
+
+	labels := Labels{
+		// Deprecated labels
+		"nvidia.com/cuda.driver.major":  driverMajor,
+		"nvidia.com/cuda.driver.minor":  driverMinor,
+		"nvidia.com/cuda.driver.rev":    driverRev,
+		"nvidia.com/cuda.runtime.major": fmt.Sprintf("%d", cudaMajor),
+		"nvidia.com/cuda.runtime.minor": fmt.Sprintf("%d", cudaMinor),
+
+		// New labels
+		"nvidia.com/cuda.driver-version.major":    driverMajor,
+		"nvidia.com/cuda.driver-version.minor":    driverMinor,
+		"nvidia.com/cuda.driver-version.revision": driverRev,
+		"nvidia.com/cuda.driver-version.full":     driverVersion,
+		"nvidia.com/cuda.runtime-version.major":   fmt.Sprintf("%d", cudaMajor),
+		"nvidia.com/cuda.runtime-version.minor":   fmt.Sprintf("%d", cudaMinor),
+		"nvidia.com/cuda.runtime-version.full":    fmt.Sprintf("%d.%d", cudaMajor, cudaMinor),
+	}
+	return labels, nil
+}
+
+// newMigCapabilityLabeler creates a new MIG capability labeler using the provided resource manager.
+// If any GPU on the node is mig-capable the label is set to true.
+func newMigCapabilityLabeler(manager resource.Manager) (Labeler, error) {
+	isMigCapable := false
+
+	devices, err := manager.GetDevices()
+	if err != nil {
+		return nil, err
+	}
+	if len(devices) == 0 {
+		// no devices, return empty labels
+		return empty{}, nil
+	}
+
+	// loop through all devices to check if any one of them is MIG capable
+	for _, d := range devices {
+		isMigCapable, err = d.IsMigCapable()
+		if err != nil {
+			return nil, fmt.Errorf("error getting mig capability: %v", err)
+		}
+		if isMigCapable {
+			break // one capable device is enough
+		}
+	}
+
+	labels := Labels{
+		"nvidia.com/mig.capable": strconv.FormatBool(isMigCapable),
+	}
+	return labels, nil
+}
+
+func newSharingLabeler(manager resource.Manager, config *spec.Config) (Labeler, error) { // builds the "nvidia.com/mps.capable" label
+	if config == nil || config.Sharing.SharingStrategy() != spec.SharingStrategyMPS {
+		labels := Labels{
+			"nvidia.com/mps.capable": "false", // no config or a non-MPS strategy: reported as not capable without querying devices
+		}
+		return labels, nil
+	}
+
+	capable, err := isMPSCapable(manager)
+	if err != nil {
+		return nil, fmt.Errorf("failed to check MPS-capable: %w", err)
+	}
+
+	labels := Labels{
+		"nvidia.com/mps.capable": strconv.FormatBool(capable),
+	}
+	return labels, nil
+}
+
+func isMPSCapable(manager resource.Manager) (bool, error) { // true only when no device reports MIG enabled
+	devices, err := manager.GetDevices()
+	if err != nil {
+		return false, fmt.Errorf("failed to get device: %w", err)
+	}
+
+	for _, d := range devices {
+		isMigEnabled, err := d.IsMigEnabled()
+		if err != nil {
+			return false, fmt.Errorf("failed to check if device is MIG-enabled: %w", err)
+		}
+		if isMigEnabled {
+			return false, fmt.Errorf("%w for mig devices", errMPSSharingNotSupported) // a MIG-enabled device is an error, not just "false"
+		}
+	}
+	return true, nil
+}
+
+// newGPUModeLabeler creates a new labeler that reports the mode of GPUs on the node.
+// GPUs can be in Graphics or Compute mode.
+func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
+	classes, err := getDeviceClasses(devices)
+	if err != nil {
+		return nil, err
+	}
+	labels := Labels{
+		"nvidia.com/gpu.mode": getModeForClasses(classes),
+	}
+	return labels, nil
+}
+
+// getModeForClasses maps a set of PCI classes to a GPU mode. The mode is
+// "graphics" for the VGA controller class and "compute" for the 3D controller
+// class. It is "unknown" when there are no classes, when the classes differ,
+// or when the class is neither of the two.
+func getModeForClasses(classes []uint32) string {
+	if len(classes) == 0 {
+		return "unknown"
+	}
+	for _, class := range classes {
+		if class != classes[0] {
+			klog.Infof("Not all GPU devices belong to the same class %#06x ", classes)
+			return "unknown"
+		}
+	}
+	switch classes[0] {
+	case nvpci.PCIVgaControllerClass:
+		return "graphics"
+	case nvpci.PCI3dControllerClass:
+		return "compute"
+	default:
+		return "unknown"
+	}
+}
+
+// getDeviceClasses returns the distinct PCI classes of the specified devices.
+// Classes are returned in first-seen order so that callers (and the log line
+// in getModeForClasses) see a deterministic result. The first error returned
+// by a device aborts the scan.
+func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
+	seenClasses := make(map[uint32]bool)
+	var classes []uint32
+	for _, d := range devices {
+		class, err := d.GetPCIClass()
+		if err != nil {
+			return nil, err
+		}
+		if seenClasses[class] {
+			continue
+		}
+		seenClasses[class] = true
+		classes = append(classes, class)
+	}
+	return classes, nil
+}
diff --git a/pkg/nvidia-plugin/pkg/lm/nvml_test.go b/pkg/nvidia-plugin/pkg/lm/nvml_test.go
new file mode 100644
index 000000000..fb6fa4793
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/lm/nvml_test.go
@@ -0,0 +1,292 @@
+package lm
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1"
+	"github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource"
+	rt "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource/testing"
+)
+
+// TestMigCapabilityLabeler checks the nvidia.com/mig.capable label for
+// different device sets.
+func TestMigCapabilityLabeler(t *testing.T) {
+	testCases := []struct {
+		description    string
+		devices        []resource.Device
+		expectedError  bool
+		expectedLabels map[string]string
+	}{
+		{
+			description: "no devices returns empty labels",
+		},
+		{
+			description: "single non-mig capable device returns mig.capable as false",
+			devices: []resource.Device{
+				rt.NewFullGPU(),
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/mig.capable": "false",
+			},
+		},
+		{
+			description: "multiple non-mig capable devices returns mig.capable as false",
+			devices: []resource.Device{
+				rt.NewFullGPU(),
+				rt.NewFullGPU(),
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/mig.capable": "false",
+			},
+		},
+		{
+			description: "single mig capable device returns mig.capable as true",
+			devices: []resource.Device{
+				rt.NewMigEnabledDevice(),
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/mig.capable": "true",
+			},
+		},
+		{
+			description: "one mig capable device among multiple returns mig.capable as true",
+			devices: []resource.Device{
+				rt.NewFullGPU(),
+				rt.NewMigEnabledDevice(),
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/mig.capable": "true",
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			nvmlMock := rt.NewManagerMockWithDevices(tc.devices...)
+
+			// Fail cleanly instead of panicking on a nil labeler below.
+			migCapabilityLabeler, err := newMigCapabilityLabeler(nvmlMock)
+			require.NoError(t, err)
+
+			labels, err := migCapabilityLabeler.Labels()
+			if tc.expectedError {
+				require.Error(t, err)
+			} else {
+				require.NoError(t, err)
+			}
+
+			require.EqualValues(t, tc.expectedLabels, labels)
+		})
+	}
+}
+
+// TestSharingLabeler checks the nvidia.com/mps.capable label for different
+// sharing configurations.
+func TestSharingLabeler(t *testing.T) {
+	testCases := []struct {
+		description    string
+		manager        resource.Manager
+		config         *spec.Config
+		expectedLabels map[string]string
+		expectedError  error
+	}{
+		{
+			description: "nil config",
+			expectedLabels: map[string]string{
+				"nvidia.com/mps.capable": "false",
+			},
+		},
+		{
+			description: "empty config",
+			config:      &spec.Config{},
+			expectedLabels: map[string]string{
+				"nvidia.com/mps.capable": "false",
+			},
+		},
+		{
+			description: "config with timeslicing replicas",
+			config: &spec.Config{
+				Sharing: spec.Sharing{
+					TimeSlicing: spec.ReplicatedResources{
+						Resources: []spec.ReplicatedResource{
+							{
+								Replicas: 2,
+							},
+						},
+					},
+				},
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/mps.capable": "false",
+			},
+		},
+		{
+			description: "config with no mps replicas",
+			config: &spec.Config{
+				Sharing: spec.Sharing{
+					MPS: &spec.ReplicatedResources{
+						Resources: []spec.ReplicatedResource{
+							{
+								Replicas: 1,
+							},
+						},
+					},
+				},
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/mps.capable": "false",
+			},
+		},
+		{
+			description: "config with mps replicas no-mig-devices",
+			manager: &resource.ManagerMock{
+				GetDevicesFunc: func() ([]resource.Device, error) {
+					devices := []resource.Device{
+						&resource.DeviceMock{
+							IsMigEnabledFunc: func() (bool, error) {
+								return false, nil
+							},
+						},
+					}
+					return devices, nil
+				},
+			},
+			config: &spec.Config{
+				Sharing: spec.Sharing{
+					MPS: &spec.ReplicatedResources{
+						Resources: []spec.ReplicatedResource{
+							{
+								Replicas: 2,
+							},
+						},
+					},
+				},
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/mps.capable": "true",
+			},
+		},
+		{
+			description: "config with mps replicas mig-devices",
+			manager: &resource.ManagerMock{
+				GetDevicesFunc: func() ([]resource.Device, error) {
+					devices := []resource.Device{
+						&resource.DeviceMock{
+							IsMigEnabledFunc: func() (bool, error) {
+								return true, nil
+							},
+						},
+					}
+					return devices, nil
+				},
+			},
+			config: &spec.Config{
+				Sharing: spec.Sharing{
+					MPS: &spec.ReplicatedResources{
+						Resources: []spec.ReplicatedResource{
+							{
+								Replicas: 2,
+							},
+						},
+					},
+				},
+			},
+			expectedError:  errMPSSharingNotSupported,
+			expectedLabels: nil,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			labels, err := newSharingLabeler(tc.manager, tc.config)
+			if tc.expectedError != nil {
+				require.ErrorIs(t, err, tc.expectedError)
+				require.Nil(t, labels)
+				return
+			}
+			require.NoError(t, err)
+			require.EqualValues(t, tc.expectedLabels, labels)
+		})
+	}
+}
+
+// TestGPUModeLabeler checks the nvidia.com/gpu.mode label. PCI class 0x030000
+// (VGA controller) maps to "graphics" and 0x030200 (3D controller) maps to
+// "compute".
+func TestGPUModeLabeler(t *testing.T) {
+	testCases := []struct {
+		description    string
+		devices        []resource.Device
+		expectedError  bool
+		expectedLabels map[string]string
+	}{
+		{
+			description: "single device with graphics PCI class",
+			devices: []resource.Device{
+				rt.NewDeviceWithPCIClassMock(0x030000),
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/gpu.mode": "graphics",
+			},
+		},
+		{
+			description: "single device with compute PCI class",
+			devices: []resource.Device{
+				rt.NewDeviceWithPCIClassMock(0x030200),
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/gpu.mode": "compute",
+			},
+		},
+		{
+			description: "single device with switch PCI class",
+			devices: []resource.Device{
+				rt.NewDeviceWithPCIClassMock(0x068000),
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/gpu.mode": "unknown",
+			},
+		},
+		{
+			description: "multiple devices have same compute PCI class",
+			devices: []resource.Device{
+				rt.NewDeviceWithPCIClassMock(0x030200),
+				rt.NewDeviceWithPCIClassMock(0x030200),
+				rt.NewDeviceWithPCIClassMock(0x030200),
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/gpu.mode": "compute",
+			},
+		},
+		{
+			description: "multiple devices have same graphics PCI class",
+			devices: []resource.Device{
+				rt.NewDeviceWithPCIClassMock(0x030000),
+				rt.NewDeviceWithPCIClassMock(0x030000),
+				rt.NewDeviceWithPCIClassMock(0x030000),
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/gpu.mode": "graphics",
+			},
+		},
+		{
+			description: "multiple device with some with graphics and others with compute PCI class",
+			devices: []resource.Device{
+				rt.NewDeviceWithPCIClassMock(0x030000),
+				rt.NewDeviceWithPCIClassMock(0x030200),
+				rt.NewDeviceWithPCIClassMock(0x030000),
+			},
+			expectedLabels: map[string]string{
+				"nvidia.com/gpu.mode": "unknown",
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			// Fail cleanly instead of panicking on a nil labeler below.
+			gpuModeLabeler, err := newGPUModeLabeler(tc.devices)
+			require.NoError(t, err)
+
+			labels, err := gpuModeLabeler.Labels()
+			if tc.expectedError {
+				require.Error(t, err)
+			} else {
+				require.NoError(t, err)
+			}
+
+			require.EqualValues(t, tc.expectedLabels, labels)
+		})
+	}
+}
diff --git a/pkg/nvidia-plugin/pkg/lm/output.go b/pkg/nvidia-plugin/pkg/lm/output.go
new file mode 100644
index 000000000..16a1af3ff
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/lm/output.go
@@ -0,0 +1,155 @@
+/**
+# Copyright 2024 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package lm
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io"
+	"os"
+	"strings"
+
+	apiequality "k8s.io/apimachinery/pkg/api/equality"
+	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/klog/v2"
+	nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1"
+	nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned"
+
+	"github.com/google/renameio"
+
+	spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1"
+	"github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/flags"
+)
+
+// Outputer defines a mechanism to output labels. The implementations in this
+// file write the labels to a file, to an io.Writer, or to a NodeFeature
+// custom resource.
+type Outputer interface {
+	// Output emits the specified labels and returns an error if they could
+	// not be written.
+	Output(Labels) error
+}
+
+// NewOutputer selects the Outputer for the specified config. Unless the
+// UseNodeFeatureAPI flag is set and true, labels go to the configured GFD
+// output file. Otherwise they are written through the NFD clientset, which
+// requires both the node name and the namespace to be set.
+// TODO: Replace this with functional options.
+func NewOutputer(config *spec.Config, nodeConfig flags.NodeConfig, clientSets flags.ClientSets) (Outputer, error) {
+	if config.Flags.UseNodeFeatureAPI == nil || !*config.Flags.UseNodeFeatureAPI {
+		// An unset output file is treated like an empty path (stdout) instead
+		// of panicking on the nil pointer dereference.
+		var path string
+		if outputFile := config.Flags.GFD.OutputFile; outputFile != nil {
+			path = *outputFile
+		}
+		return ToFile(path), nil
+	}
+
+	if nodeConfig.Name == "" {
+		return nil, fmt.Errorf("required flag node-name not set")
+	}
+	if nodeConfig.Namespace == "" {
+		return nil, fmt.Errorf("required flag namespace not set")
+	}
+	o := nodeFeatureObject{
+		nodeConfig:   nodeConfig,
+		nfdClientset: clientSets.NFD,
+	}
+	return &o, nil
+}
+
+// ToFile returns an Outputer that writes labels to the specified path. An
+// empty path selects standard output.
+func ToFile(path string) Outputer {
+	if path == "" {
+		return &toWriter{os.Stdout}
+	}
+
+	o := toFile(path)
+	return &o
+}
+
+// toFile writes to the specified file.
+type toFile string
+
+// toWriter writes to the specified writer
+type toWriter struct {
+	io.Writer
+}
+
+// Output renders the labels into a buffer and then writes the buffer to the
+// file in a single atomic step, so a write error never leaves a partially
+// written file behind.
+func (path *toFile) Output(labels Labels) error {
+	klog.Infof("Writing labels to output file %v", *path)
+
+	buffer := new(bytes.Buffer)
+	output := &toWriter{buffer}
+	if err := output.Output(labels); err != nil {
+		// Wrap with %w so callers can still match the underlying error.
+		return fmt.Errorf("error writing labels to buffer: %w", err)
+	}
+	// write file atomically
+	if err := renameio.WriteFile(string(*path), buffer.Bytes(), 0644); err != nil {
+		return fmt.Errorf("error atomically writing file '%s': %w", *path, err)
+	}
+	return nil
+}
+
+// Output writes one "key=value" line per label to the underlying writer and
+// stops at the first write error. Labels are emitted in map iteration order,
+// which is not stable between calls.
+func (output *toWriter) Output(labels Labels) error {
+	for k, v := range labels {
+		if _, err := fmt.Fprintf(output, "%s=%s\n", k, v); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// nodeFeatureVendorPrefix is joined with the node name to form the name of
+// the NodeFeature object.
+const nodeFeatureVendorPrefix = "nvidia-features-for"
+
+// nodeFeatureObject outputs labels to a NodeFeature custom resource using the
+// NFD clientset.
+type nodeFeatureObject struct {
+	nodeConfig   flags.NodeConfig
+	nfdClientset nfdclientset.Interface
+}
+
+// Output creates/updates the node-specific NodeFeature custom resource.
+func (n *nodeFeatureObject) Output(labels Labels) error {
+	nodename := n.nodeConfig.Name
+	if nodename == "" {
+		return fmt.Errorf("required flag %q not set", "node-name")
+	}
+	namespace := n.nodeConfig.Namespace
+	nodeFeatureName := strings.Join([]string{nodeFeatureVendorPrefix, nodename}, "-")
+
+	ctx := context.TODO()
+	nodeFeatures := n.nfdClientset.NfdV1alpha1().NodeFeatures(namespace)
+	// ownerLabels ties the NodeFeature object to this node.
+	ownerLabels := map[string]string{nfdv1alpha1.NodeFeatureObjNodeNameLabel: nodename}
+
+	current, err := nodeFeatures.Get(ctx, nodeFeatureName, metav1.GetOptions{})
+
+	// The object does not exist yet: create it from scratch.
+	if errors.IsNotFound(err) {
+		klog.Infof("creating NodeFeature object %s", nodeFeatureName)
+		fresh := &nfdv1alpha1.NodeFeature{
+			TypeMeta:   metav1.TypeMeta{},
+			ObjectMeta: metav1.ObjectMeta{Name: nodeFeatureName, Labels: ownerLabels},
+			Spec:       nfdv1alpha1.NodeFeatureSpec{Features: *nfdv1alpha1.NewFeatures(), Labels: labels},
+		}
+
+		created, err := nodeFeatures.Create(ctx, fresh, metav1.CreateOptions{})
+		if err != nil {
+			return fmt.Errorf("failed to create NodeFeature object %q: %w", fresh.Name, err)
+		}
+
+		klog.Infof("NodeFeature object created: %v", created)
+		return nil
+	}
+	if err != nil {
+		return fmt.Errorf("failed to get NodeFeature object: %w", err)
+	}
+
+	// The object exists: build the desired state and update only on a difference.
+	desired := current.DeepCopy()
+	desired.Labels = ownerLabels
+	desired.Spec = nfdv1alpha1.NodeFeatureSpec{Features: *nfdv1alpha1.NewFeatures(), Labels: labels}
+
+	if apiequality.Semantic.DeepEqual(current, desired) {
+		klog.Infof("no changes in NodeFeature object, not updating")
+		return nil
+	}
+
+	klog.Infof("updating NodeFeature object %s", nodeFeatureName)
+	updated, err := nodeFeatures.Update(ctx, desired, metav1.UpdateOptions{})
+	if err != nil {
+		return fmt.Errorf("failed to update NodeFeature object %q: %w", current.Name, err)
+	}
+	klog.Infof("NodeFeature object updated: %v", updated)
+	return nil
+}
diff --git a/pkg/nvidia-plugin/pkg/lm/resource.go
b/pkg/nvidia-plugin/pkg/lm/resource.go
new file mode 100644
index 000000000..799adea4c
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/lm/resource.go
@@ -0,0 +1,319 @@
+/**
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package lm
+
+import (
+	"fmt"
+	"regexp"
+	"strings"
+
+	spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1"
+	"github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource"
+)
+
+// fullGPUResourceName is the resource name used for labels of full GPUs.
+const fullGPUResourceName = "nvidia.com/gpu"
+
+// NewGPUResourceLabelerWithoutSharing creates a resource labeler for the specified device that does not apply sharing labels.
+func NewGPUResourceLabelerWithoutSharing(device resource.Device, count int) (Labeler, error) {
+	// NOTE: We use a nil config to signal that sharing is disabled.
+	return NewGPUResourceLabeler(nil, device, count)
+}
+
+// NewGPUResourceLabeler creates a resource labeler for the specified full GPU device with the specified count.
+// A count of zero yields an empty labeler. Otherwise the result merges the
+// base labels (product, count, replicas, sharing strategy), the memory label
+// (omitted when the device reports zero memory) and the architecture labels.
+// A nil config disables sharing.
+func NewGPUResourceLabeler(config *spec.Config, device resource.Device, count int) (Labeler, error) {
+	if count == 0 {
+		return empty{}, nil
+	}
+
+	model, err := device.GetName()
+	if err != nil {
+		// Wrap with %w so callers can still match the underlying error.
+		return nil, fmt.Errorf("failed to get device model: %w", err)
+	}
+
+	totalMemoryMB, err := device.GetTotalMemoryMB()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get memory info for device: %w", err)
+	}
+
+	// Named rl so the local does not shadow the resourceLabeler type.
+	rl := newResourceLabeler(fullGPUResourceName, config)
+
+	architectureLabels, err := newArchitectureLabels(rl, device)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create architecture labels: %w", err)
+	}
+
+	// Only emit a memory label when the device reports a non-zero amount.
+	var memoryLabeler Labeler = empty{}
+	if totalMemoryMB != 0 {
+		memoryLabeler = rl.single("memory", totalMemoryMB)
+	}
+
+	labelers := Merge(
+		rl.baseLabeler(count, model),
+		memoryLabeler,
+		architectureLabels,
+	)
+
+	return labelers, nil
+}
+
+// NewMIGResourceLabeler creates a resource labeler for the specified MIG device with the specified resource name.
+func NewMIGResourceLabeler(resourceName spec.ResourceName, config *spec.Config, device resource.Device, count int) (Labeler, error) {
+	if count == 0 {
+		return empty{}, nil
+	}
+
+	// The product label combines the parent GPU's model with the MIG profile name.
+	parent, err := device.GetDeviceHandleFromMigDeviceHandle()
+	if err != nil {
+		// Wrap with %w so callers can still match the underlying error.
+		return nil, fmt.Errorf("failed to get parent of MIG device: %w", err)
+	}
+	model, err := parent.GetName()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get device model: %w", err)
+	}
+
+	migProfile, err := device.GetName()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get MIG profile name: %w", err)
+	}
+
+	rl := newResourceLabeler(resourceName, config)
+
+	attributeLabels, err := newMigAttributeLabels(rl, device)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get MIG attribute labels: %w", err)
+	}
+
+	labelers := Merge(
+		rl.baseLabeler(count, model, "MIG", migProfile),
+		attributeLabels,
+	)
+
+	return labelers, nil
+}
+
+// newResourceLabeler creates a resourceLabeler for the specified resource
+// name. A nil config leaves the sharing settings nil, which marks sharing as
+// disabled.
+func newResourceLabeler(resourceName spec.ResourceName, config *spec.Config) resourceLabeler {
+	var sharing *spec.Sharing
+	if config != nil {
+		sharing = &config.Sharing
+	}
+	return resourceLabeler{
+		resourceName: resourceName,
+		sharing:      sharing,
+	}
+}
+
+// resourceLabeler generates labels whose keys are prefixed with a resource
+// name. A nil sharing field means that sharing is disabled.
+type resourceLabeler struct {
+	resourceName spec.ResourceName
+	sharing      *spec.Sharing
+}
+
+// single creates a single label for the resource. The label key is
+// <resourceName>.<suffix>
+func (rl resourceLabeler) single(suffix string, value interface{}) Labels {
+	return rl.labels(map[string]interface{}{suffix: value})
+}
+
+// labels creates a set of labels from the specified map for the resource.
+// Each key in the map corresponds to a label <resourceName>.<key>
+func (rl resourceLabeler) labels(suffixValues map[string]interface{}) Labels {
+	labels := make(Labels)
+	for suffix, value := range suffixValues {
+		rl.updateLabel(labels, suffix, value)
+	}
+
+	return labels
+}
+
+// updateLabel modifies the specified labels, updating <resourceName>.<suffix> with
+// the provided value.
+func (rl resourceLabeler) updateLabel(labels Labels, suffix string, value interface{}) {
+	// fmt.Sprint applies the default (%v) formatting to its single operand.
+	labels[rl.key(suffix)] = fmt.Sprint(value)
+}
+
+// key builds the label key for a suffix by joining the resource name and the
+// suffix with a dot.
+func (rl resourceLabeler) key(suffix string) string {
+	return strings.Join([]string{string(rl.resourceName), suffix}, ".")
+}
+
+// baseLabeler returns the product, count, replicas and sharing-strategy
+// labels for the resource. The strategy is only read from the sharing
+// settings when they are present and more than one replica is configured.
+func (rl resourceLabeler) baseLabeler(count int, parts ...string) Labeler {
+	replicas := rl.getReplicas()
+
+	strategy := spec.SharingStrategyNone
+	if sharingActive := rl.sharing != nil && replicas > 1; sharingActive {
+		strategy = rl.sharing.SharingStrategy()
+	}
+
+	return rl.labels(map[string]interface{}{
+		"product":          rl.getProductName(parts...),
+		"count":            count,
+		"replicas":         replicas,
+		"sharing-strategy": strategy,
+	})
+}
+
+// productLabel returns the product label for the resource, or no labels at
+// all when the product name is empty.
+//
+// Deprecated
+func (rl resourceLabeler) productLabel(parts ...string) Labels {
+	if name := rl.getProductName(parts...); name != "" {
+		return rl.single("product", name)
+	}
+	return Labels{}
+}
+
+// getProductName sanitises the non-empty parts and joins them with dashes.
+// "SHARED" is appended when the resource is shared but not renamed. With no
+// non-empty parts the name is empty.
+func (rl resourceLabeler) getProductName(parts ...string) string {
+	var nameParts []string
+	for _, part := range parts {
+		if part == "" {
+			continue
+		}
+		nameParts = append(nameParts, sanitise(part))
+	}
+	if len(nameParts) == 0 {
+		return ""
+	}
+
+	if rl.isShared() && !rl.isRenamed() {
+		nameParts = append(nameParts, "SHARED")
+	}
+	return strings.Join(nameParts, "-")
+}
+
+// getReplicas returns 0 when sharing is disabled, the configured replica
+// count when the resource has a positive one, and 1 otherwise.
+func (rl resourceLabeler) getReplicas() int {
+	if rl.sharingDisabled() {
+		return 0
+	}
+	if info := rl.replicationInfo(); info != nil && info.Replicas > 0 {
+		return info.Replicas
+	}
+	return 1
+}
+
+// sharingDisabled checks whether the resourceLabeler has sharing disabled
+// TODO: The nil check here is because we call NewGPUResourceLabeler with a nil config when sharing is disabled.
+func (rl resourceLabeler) sharingDisabled() bool {
+	return rl.sharing == nil
+}
+
+// isShared checks whether the resource is shared, i.e. whether it is
+// configured with more than one replica.
+func (rl resourceLabeler) isShared() bool {
+	r := rl.replicationInfo()
+	return r != nil && r.Replicas > 1
+}
+
+// isRenamed checks whether the resource is renamed.
+func (rl resourceLabeler) isRenamed() bool {
+	r := rl.replicationInfo()
+	return r != nil && r.Rename != ""
+}
+
+// replicationInfo searches the associated config for the resource and returns the replication info.
+// It returns nil when sharing is disabled or no entry matches the resource name.
+func (rl resourceLabeler) replicationInfo() *spec.ReplicatedResource {
+	if rl.sharingDisabled() {
+		return nil
+	}
+	for _, r := range rl.sharing.ReplicatedResources().Resources {
+		if r.Name == rl.resourceName {
+			// r is the loop's copy of the entry; returning right away makes
+			// taking its address safe.
+			return &r
+		}
+	}
+	return nil
+}
+
+// newMigAttributeLabels creates one label per attribute reported by the MIG device.
+func newMigAttributeLabels(rl resourceLabeler, device resource.Device) (Labels, error) {
+	attributes, err := device.GetAttributes()
+	if err != nil {
+		// Wrap with %w so callers can still match the underlying error.
+		return nil, fmt.Errorf("unable to get attributes of MIG device: %w", err)
+	}
+
+	labels := rl.labels(attributes)
+
+	return labels, nil
+}
+
+// newArchitectureLabels creates the family, compute.major and compute.minor
+// labels from the device's CUDA compute capability. No labels are produced
+// when the major version is reported as zero.
+func newArchitectureLabels(rl resourceLabeler, device resource.Device) (Labels, error) {
+	computeMajor, computeMinor, err := device.GetCudaComputeCapability()
+	if err != nil {
+		return nil, fmt.Errorf("failed to determine CUDA compute capability: %w", err)
+	}
+
+	if computeMajor == 0 {
+		return make(Labels), nil
+	}
+
+	family := getArchFamily(computeMajor, computeMinor)
+
+	labels := rl.labels(map[string]interface{}{
+		"family":        family,
+		"compute.major": computeMajor,
+		"compute.minor": computeMinor,
+	})
+
+	return labels, nil
+}
+
+// getArchFamily maps a CUDA compute capability to the name of the GPU
+// architecture family. Unrecognised major versions yield "undefined".
+// TODO: This should be a function in go-nvlib
+func getArchFamily(computeMajor, computeMinor int) string {
+	switch computeMajor {
+	case 1:
+		return "tesla"
+	case 2:
+		return "fermi"
+	case 3:
+		return "kepler"
+	case 5:
+		return "maxwell"
+	case 6:
+		return "pascal"
+	case 7:
+		// Major version 7 is split between Volta (7.0-7.4) and Turing (7.5+).
+		if computeMinor < 5 {
+			return "volta"
+		}
+		return "turing"
+	case 8:
+		// Major version 8 is split between Ampere (8.0-8.8) and Ada Lovelace (8.9).
+		if computeMinor < 9 {
+			return "ampere"
+		}
+		return "ada-lovelace"
+	case 9:
+		return "hopper"
+	case 10, 12:
+		return "blackwell"
+	}
+	return "undefined"
+}
+
+// sanitiseDisallowedChars matches every character that sanitise strips:
+// anything other than letters, digits, '-', '_', '.' and space. It is
+// compiled once instead of on every call.
+var sanitiseDisallowedChars = regexp.MustCompile("[^A-Za-z0-9-_. ]")
+
+// sanitise removes disallowed characters from the input, trims it, and
+// replaces each run of whitespace with a single dash.
+func sanitise(input string) string {
+	stripped := sanitiseDisallowedChars.ReplaceAllString(input, "")
+	// remove redundant blank spaces
+	return strings.Join(strings.Fields(stripped), "-")
+}
diff --git a/pkg/nvidia-plugin/pkg/lm/resource_test.go b/pkg/nvidia-plugin/pkg/lm/resource_test.go
new file mode 100644
index 000000000..24b279bea
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/lm/resource_test.go
@@ -0,0 +1,437 @@
+/**
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package lm
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1"
+	rt "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource/testing"
+)
+
+// TestGPUResourceLabeler checks the labels generated for a full GPU under
+// the different sharing configurations.
+func TestGPUResourceLabeler(t *testing.T) {
+	gpu := rt.NewFullGPU()
+
+	// want builds the expected label set. Only the replicas, sharing strategy
+	// and product name differ between the cases.
+	want := func(replicas, strategy, product string) Labels {
+		return Labels{
+			"nvidia.com/gpu.count":            "1",
+			"nvidia.com/gpu.replicas":         replicas,
+			"nvidia.com/gpu.sharing-strategy": strategy,
+			"nvidia.com/gpu.memory":           "300",
+			"nvidia.com/gpu.product":          product,
+			"nvidia.com/gpu.family":           "ampere",
+			"nvidia.com/gpu.compute.major":    "8",
+			"nvidia.com/gpu.compute.minor":    "0",
+		}
+	}
+	// timeSliced and mps wrap a single replicated resource in the matching
+	// sharing configuration.
+	timeSliced := func(r spec.ReplicatedResource) spec.Sharing {
+		return spec.Sharing{
+			TimeSlicing: spec.ReplicatedResources{
+				Resources: []spec.ReplicatedResource{r},
+			},
+		}
+	}
+	mps := func(r spec.ReplicatedResource) spec.Sharing {
+		return spec.Sharing{
+			MPS: &spec.ReplicatedResources{
+				Resources: []spec.ReplicatedResource{r},
+			},
+		}
+	}
+
+	// Replication settings exercised with both sharing strategies.
+	notGPU := spec.ReplicatedResource{Name: "nvidia.com/not-gpu", Replicas: 2}
+	sharedGPU := spec.ReplicatedResource{Name: "nvidia.com/gpu", Replicas: 2}
+	renamedGPU := spec.ReplicatedResource{Name: "nvidia.com/gpu", Rename: "nvidia.com/gpu.shared", Replicas: 2}
+
+	cases := []struct {
+		name    string
+		count   int
+		sharing spec.Sharing
+		want    Labels
+	}{
+		{
+			name: "zero count returns empty",
+		},
+		{
+			name:  "no sharing",
+			count: 1,
+			want:  want("1", "none", "MOCKMODEL"),
+		},
+		{
+			name:    "time-slicing ignores non-matching resource",
+			count:   1,
+			sharing: timeSliced(notGPU),
+			want:    want("1", "none", "MOCKMODEL"),
+		},
+		{
+			name:    "time-slicing appends suffix and doubles count",
+			count:   1,
+			sharing: timeSliced(sharedGPU),
+			want:    want("2", "time-slicing", "MOCKMODEL-SHARED"),
+		},
+		{
+			name:    "time-slicing renamed does not append suffix and doubles count",
+			count:   1,
+			sharing: timeSliced(renamedGPU),
+			want:    want("2", "time-slicing", "MOCKMODEL"),
+		},
+		{
+			name:    "mps ignores non-matching resource",
+			count:   1,
+			sharing: mps(notGPU),
+			want:    want("1", "none", "MOCKMODEL"),
+		},
+		{
+			name:    "mps appends suffix and doubles count",
+			count:   1,
+			sharing: mps(sharedGPU),
+			want:    want("2", "mps", "MOCKMODEL-SHARED"),
+		},
+		{
+			name:    "mps renamed does not append suffix and doubles count",
+			count:   1,
+			sharing: mps(renamedGPU),
+			want:    want("2", "mps", "MOCKMODEL"),
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			labeler, err := NewGPUResourceLabeler(&spec.Config{Sharing: c.sharing}, gpu, c.count)
+			require.NoError(t, err)
+
+			got, err := labeler.Labels()
+			require.NoError(t, err)
+
+			require.EqualValues(t, c.want, got)
+		})
+	}
+}
+
+// TestSanitise checks that disallowed characters are dropped and that
+// whitespace is collapsed into single dashes.
+func TestSanitise(t *testing.T) {
+	cases := []struct {
+		in   string
+		want string
+	}{
+		{in: "a space separated string", want: "a-space-separated-string"},
+		{in: "some(thing)else", want: "somethingelse"},
+		{in: "some ( thing )else", want: "some-thing-else"},
+		{in: "NVIDIA-TITAN-X-(Pascal)", want: "NVIDIA-TITAN-X-Pascal"},
+		{in: " input with multiple spaces ", want: "input-with-multiple-spaces"},
+		{in: "some [ / thing / ]else", want: "some-thing-else"},
+		{in: "some / thing /else", want: "some-thing-else"},
+		{in: "some-thing.else_new", want: "some-thing.else_new"},
+	}
+	for _, c := range cases {
+		t.Run(c.in, func(t *testing.T) {
+			require.EqualValues(t, c.want, sanitise(c.in))
+		})
+	}
+}
+
+// TestMigResourceLabeler checks the labels generated for a MIG device under
+// the different time-slicing configurations.
+func TestMigResourceLabeler(t *testing.T) {
+	const (
+		gpuResource = "nvidia.com/gpu"
+		migResource = "nvidia.com/mig-1g.1gb"
+	)
+
+	mig := rt.NewMigDevice(1, 2, 300)
+	rt.NewMigEnabledDevice(mig)
+
+	// want builds the expected label set for the given resource name. Only
+	// the replicas, sharing strategy and product name differ between the cases.
+	want := func(resource, replicas, strategy, product string) Labels {
+		return Labels{
+			resource + ".count":            "1",
+			resource + ".replicas":         replicas,
+			resource + ".sharing-strategy": strategy,
+			resource + ".memory":           "300",
+			resource + ".product":          product,
+			resource + ".multiprocessors":  "0",
+			resource + ".slices.gi":        "1",
+			resource + ".slices.ci":        "2",
+			resource + ".engines.copy":     "0",
+			resource + ".engines.decoder":  "0",
+			resource + ".engines.encoder":  "0",
+			resource + ".engines.jpeg":     "0",
+			resource + ".engines.ofa":      "0",
+		}
+	}
+	// replicated wraps the given resources in a ReplicatedResources value.
+	replicated := func(rs ...spec.ReplicatedResource) spec.ReplicatedResources {
+		return spec.ReplicatedResources{Resources: rs}
+	}
+
+	cases := []struct {
+		name         string
+		resourceName spec.ResourceName
+		count        int
+		timeSlicing  spec.ReplicatedResources
+		want         Labels
+	}{
+		{
+			name: "zero count returns empty",
+		},
+		{
+			name:         "no sharing",
+			resourceName: gpuResource,
+			count:        1,
+			want:         want(gpuResource, "1", "none", "MOCKMODEL-MIG-1g.300gb"),
+		},
+		{
+			name:         "shared appends suffix and doubles count",
+			resourceName: gpuResource,
+			count:        1,
+			timeSlicing: replicated(
+				spec.ReplicatedResource{Name: gpuResource, Replicas: 2},
+			),
+			want: want(gpuResource, "2", "time-slicing", "MOCKMODEL-MIG-1g.300gb-SHARED"),
+		},
+		{
+			name:         "renamed does not append suffix and doubles count",
+			resourceName: gpuResource,
+			count:        1,
+			timeSlicing: replicated(
+				spec.ReplicatedResource{Name: gpuResource, Rename: gpuResource + ".shared", Replicas: 2},
+			),
+			want: want(gpuResource, "2", "time-slicing", "MOCKMODEL-MIG-1g.300gb"),
+		},
+		{
+			name:         "mig mixed appends shared",
+			resourceName: migResource,
+			count:        1,
+			timeSlicing: replicated(
+				spec.ReplicatedResource{Name: gpuResource, Rename: gpuResource + ".shared", Replicas: 2},
+				spec.ReplicatedResource{Name: migResource, Replicas: 2},
+			),
+			want: want(migResource, "2", "time-slicing", "MOCKMODEL-MIG-1g.300gb-SHARED"),
+		},
+		{
+			name:         "mig mixed rename does not append",
+			resourceName: migResource,
+			count:        1,
+			timeSlicing: replicated(
+				spec.ReplicatedResource{Name: migResource, Rename: migResource + ".shared", Replicas: 2},
+			),
+			want: want(migResource, "2", "time-slicing", "MOCKMODEL-MIG-1g.300gb"),
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			cfg := &spec.Config{
+				Sharing: spec.Sharing{TimeSlicing: c.timeSlicing},
+			}
+			labeler, err := NewMIGResourceLabeler(c.resourceName, cfg, mig, c.count)
+			require.NoError(t, err)
+
+			got, err := labeler.Labels()
+			require.NoError(t, err)
+
+			require.EqualValues(t, c.want, got)
+		})
+	}
+}
diff --git a/pkg/nvidia-plugin/pkg/lm/strategy.go b/pkg/nvidia-plugin/pkg/lm/strategy.go
new file mode 100644
index 000000000..170adc336
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/lm/strategy.go
@@ -0,0 +1,28 @@
+/**
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/ + +package lm + +// migStrategyLabeler creates a labeler for setting the mig strategy label; an empty labeler is returned for MigStrategyNone +func migStrategyLabeler(strategy string) Labeler { + if strategy == MigStrategyNone { + return empty{} + } + + return Labels{ + "nvidia.com/mig.strategy": strategy, + } +} diff --git a/pkg/nvidia-plugin/pkg/lm/timestamp.go b/pkg/nvidia-plugin/pkg/lm/timestamp.go new file mode 100644 index 000000000..9ea1d0e1e --- /dev/null +++ b/pkg/nvidia-plugin/pkg/lm/timestamp.go @@ -0,0 +1,37 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package lm + +import ( + "fmt" + "time" + + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" +) + +// NewTimestampLabeler creates a new label manager for generating timestamp +// labels from the specified config. If the noTimestamp option is set an empty +// label manager is returned. +func NewTimestampLabeler(config *spec.Config) Labeler { + if *config.Flags.GFD.NoTimestamp { + return empty{} + } + + return Labels{ + "nvidia.com/gfd.timestamp": fmt.Sprintf("%d", time.Now().Unix()), + } +} diff --git a/pkg/nvidia-plugin/pkg/lm/vgpu.go b/pkg/nvidia-plugin/pkg/lm/vgpu.go new file mode 100644 index 000000000..6c61169b7 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/lm/vgpu.go @@ -0,0 +1,58 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package lm + +import ( + "fmt" + "strconv" + + "k8s.io/klog/v2" + + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/vgpu" +) + +// vgpuLabeler manages vGPU labels for the node +type vgpuLabeler struct { + lib vgpu.Interface +} + +// NewVGPULabeler creates a new vGPU label manager using the provided vgpu +// library. +func NewVGPULabeler(vgpu vgpu.Interface) Labeler { + return vgpuLabeler{lib: vgpu} +} + +// Labels generates the VGPU labels for the node. If the devices cannot be listed, the error is logged and (nil, nil) is returned. +func (manager vgpuLabeler) Labels() (Labels, error) { + devices, err := manager.lib.Devices() + if err != nil { + klog.ErrorS(err, "unable to get vGPU devices") + return nil, nil + } + labels := Labels{ + "nvidia.com/vgpu.present": strconv.FormatBool(len(devices) > 0), + } + for _, device := range devices { + info, err := device.GetInfo() + if err != nil { + return nil, fmt.Errorf("error getting vGPU device info: %v", err) + } + labels["nvidia.com/vgpu.host-driver-version"] = info.HostDriverVersion + labels["nvidia.com/vgpu.host-driver-branch"] = info.HostDriverBranch + } + return labels, nil +} diff --git a/pkg/nvidia-plugin/pkg/logger/klog.go b/pkg/nvidia-plugin/pkg/logger/klog.go new file mode 100644 index 000000000..5cbfba7d0 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/logger/klog.go @@ -0,0 +1,34 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package logger + +import "k8s.io/klog/v2" + +type toKlog struct{} + +// ToKlog allows the klog logger to be passed to functions where this is needed. +var ToKlog = &toKlog{} + +// Warning forwards the arguments to the klog.Warning function. +func (l toKlog) Warning(args ...interface{}) { + klog.Warning(args...) +} + +// Warningf forwards the arguments to the klog.Warningf function. +func (l toKlog) Warningf(format string, args ...interface{}) { + klog.Warningf(format, args...) +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/mig/mig.go b/pkg/nvidia-plugin/pkg/mig/mig-dp.go similarity index 70% rename from pkg/device-plugin/nvidiadevice/nvinternal/mig/mig.go rename to pkg/nvidia-plugin/pkg/mig/mig-dp.go index cc38b4d25..f3121d975 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/mig/mig.go +++ b/pkg/nvidia-plugin/pkg/mig/mig-dp.go @@ -1,34 +1,4 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ +// Copyright (c) 2021 - 2022, NVIDIA CORPORATION. All rights reserved. package mig @@ -49,7 +19,7 @@ const ( nvcapsDevicePath = "/dev/nvidia-caps" ) -// GetMigCapabilityDevicePaths returns a mapping of MIG capability path to device node path. +// GetMigCapabilityDevicePaths returns a mapping of MIG capability path to device node path func GetMigCapabilityDevicePaths() (map[string]string, error) { // Open nvcapsMigMinorsPath for walking. // If the nvcapsMigMinorsPath does not exist, then we are not on a MIG diff --git a/pkg/nvidia-plugin/pkg/mig/mig.go b/pkg/nvidia-plugin/pkg/mig/mig.go new file mode 100644 index 000000000..0d5754209 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/mig/mig.go @@ -0,0 +1,124 @@ +/** +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package mig + +import ( + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource" +) + +// DeviceInfo stores information about all devices on the node +type DeviceInfo struct { + // manager is the resource manager used to enumerate the devices + manager resource.Manager + // devicesMap holds a list of devices, separated by whether they have MigEnabled or not + devicesMap map[bool][]resource.Device +} + +// NewDeviceInfo creates a new DeviceInfo struct and returns a pointer to it. +func NewDeviceInfo(manager resource.Manager) *DeviceInfo { + return &DeviceInfo{ + manager: manager, + devicesMap: nil, // Is initialized on first use + } +} + +// GetDevicesMap returns the list of devices separated by whether they have MIG enabled. +// The first call will construct the map. +func (di *DeviceInfo) GetDevicesMap() (map[bool][]resource.Device, error) { + if di.devicesMap != nil { + return di.devicesMap, nil + } + + devices, err := di.manager.GetDevices() + if err != nil { + return nil, err + } + + migEnabledDevicesMap := make(map[bool][]resource.Device) + for _, d := range devices { + isMigEnabled, err := d.IsMigEnabled() + if err != nil { + return nil, err + } + + migEnabledDevicesMap[isMigEnabled] = append(migEnabledDevicesMap[isMigEnabled], d) + } + + di.devicesMap = migEnabledDevicesMap + + return di.devicesMap, nil +} + +// GetDevicesWithMigEnabled returns a list of devices with migEnabled=true +func (di *DeviceInfo) GetDevicesWithMigEnabled() ([]resource.Device, error) { + devicesMap, err := di.GetDevicesMap() + if err != nil { + return nil, err + } + return devicesMap[true], nil +} + +// GetDevicesWithMigDisabled returns a list of devices with migEnabled=false +func (di *DeviceInfo) GetDevicesWithMigDisabled() ([]resource.Device, error) { + devicesMap, err := di.GetDevicesMap() + if err != nil { + return nil, err + } + return devicesMap[false], nil +} + +// AnyMigEnabledDeviceIsEmpty checks whether at least one MIG-enabled device has no MIG devices configured +func (di *DeviceInfo)
AnyMigEnabledDeviceIsEmpty() (bool, error) { + devicesMap, err := di.GetDevicesMap() + if err != nil { + return false, err + } + + if len(devicesMap[true]) == 0 { + // By definition the property is true for the empty set + return true, nil + } + + for _, d := range devicesMap[true] { + migs, err := d.GetMigDevices() + if err != nil { + return false, err + } + if len(migs) == 0 { + return true, nil + } + } + return false, nil +} + +// GetAllMigDevices returns a list of all MIG devices. +func (di *DeviceInfo) GetAllMigDevices() ([]resource.Device, error) { + devicesMap, err := di.GetDevicesMap() + if err != nil { + return nil, err + } + + var migs []resource.Device + for _, d := range devicesMap[true] { + devs, err := d.GetMigDevices() + if err != nil { + return nil, err + } + migs = append(migs, devs...) + } + return migs, nil +} diff --git a/pkg/nvidia-plugin/pkg/plugin/api.go b/pkg/nvidia-plugin/pkg/plugin/api.go new file mode 100644 index 000000000..ce37316f9 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/plugin/api.go @@ -0,0 +1,26 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package plugin + +import "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" + +// Interface defines the API for the plugin package +type Interface interface { + Devices() rm.Devices + Start(string) error + Stop() error +} diff --git a/pkg/nvidia-plugin/pkg/plugin/factory.go b/pkg/nvidia-plugin/pkg/plugin/factory.go new file mode 100644 index 000000000..cd9952d11 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/plugin/factory.go @@ -0,0 +1,138 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package plugin + +import ( + "fmt" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "k8s.io/klog/v2" + + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/cdi" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/imex" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" +) + +type options struct { + infolib info.Interface + nvmllib nvml.Interface + devicelib device.Interface + + failOnInitError bool + + cdiHandler cdi.Interface + config *nvidia.DeviceConfig + + deviceListStrategies spec.DeviceListStrategies + + imexChannels imex.Channels +} + +// New creates a new set of plugins with the supplied options. It returns (nil, nil) if no config was supplied.
+func New(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, opts ...Option) ([]Interface, error) { + o := &options{ + infolib: infolib, + nvmllib: nvmllib, + devicelib: devicelib, + } + for _, opt := range opts { + opt(o) + } + + if o.config == nil { + klog.Warning("no config provided, returning a null manager") + return nil, nil + } + + if o.cdiHandler == nil { + o.cdiHandler = cdi.NewNullHandler() + } + + resourceManagers, err := o.getResourceManagers() + if err != nil { + return nil, fmt.Errorf("failed to construct resource managers: %w", err) + } + + var plugins []Interface + for _, resourceManager := range resourceManagers { + plugin, err := o.devicePluginForResource(resourceManager) + if err != nil { + return nil, fmt.Errorf("failed to create plugin: %w", err) + } + plugins = append(plugins, plugin) + } + return plugins, nil +} + +// getResourceManagers constructs a set of resource managers. +// Each resource manager maps to a specific named extended resource and may +// include full GPUs or MIG devices.
+func (o *options) getResourceManagers() ([]rm.ResourceManager, error) { + strategy := o.resolveStrategy(*o.config.Flags.DeviceDiscoveryStrategy) + switch strategy { + case "nvml": + ret := o.nvmllib.Init() + if ret != nvml.SUCCESS { + klog.Errorf("Failed to initialize NVML: %v.", ret) + klog.Errorf("If this is a GPU node, did you set the docker default runtime to `nvidia`?") + klog.Errorf("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") + klog.Errorf("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") + klog.Errorf("If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes") + if o.failOnInitError { + return nil, fmt.Errorf("nvml init failed: %v", ret) + } + klog.Warningf("nvml init failed: %v", ret) + return nil, nil + } + defer func() { + _ = o.nvmllib.Shutdown() + }() + + return rm.NewNVMLResourceManagers(o.infolib, o.nvmllib, o.devicelib, o.config) + case "tegra": + return rm.NewTegraResourceManagers(o.config) + default: + klog.Errorf("Incompatible strategy detected %v", strategy) + klog.Error("If this is a GPU node, did you configure the NVIDIA Container Toolkit?") + klog.Error("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") + klog.Error("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") + klog.Error("If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes") + if o.failOnInitError { + return nil, fmt.Errorf("invalid device discovery strategy") + } + return nil, nil + } +} + +func (o *options) resolveStrategy(strategy string) string { + if strategy != "" && strategy != "auto" { + return strategy + } + + platform := o.infolib.ResolvePlatform() + switch platform { + case info.PlatformNVML, info.PlatformWSL: + return "nvml" + case info.PlatformTegra: + return
"tegra" + } + return strategy +} diff --git a/pkg/nvidia-plugin/pkg/plugin/mps.go b/pkg/nvidia-plugin/pkg/plugin/mps.go new file mode 100644 index 000000000..c4b304f07 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/plugin/mps.go @@ -0,0 +1,91 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package plugin + +import ( + "errors" + "fmt" + + "k8s.io/klog/v2" + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/mps-control-daemon/mps" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" +) + +type mpsOptions struct { + enabled bool + resourceName spec.ResourceName + daemon *mps.Daemon + hostRoot mps.Root +} + +// getMPSOptions returns the MPS options specified for the resource manager. +// If MPS is not configured an empty set of options is returned; MIG devices are rejected with an error. +func (o *options) getMPSOptions(resourceManager rm.ResourceManager) (mpsOptions, error) { + if o.config.Sharing.SharingStrategy() != spec.SharingStrategyMPS { + return mpsOptions{}, nil + } + + // TODO: It might make sense to pull this logic into a resource manager.
+ for _, device := range resourceManager.Devices() { + if device.IsMigDevice() { + return mpsOptions{}, errors.New("sharing using MPS is not supported for MIG devices") + } + } + + m := mpsOptions{ + enabled: true, + resourceName: resourceManager.Resource(), + daemon: mps.NewDaemon(resourceManager, mps.ContainerRoot), + hostRoot: mps.Root(*o.config.Flags.CommandLineFlags.MpsRoot), + } + return m, nil +} + +func (m *mpsOptions) waitForDaemon() error { + if m == nil || !m.enabled { + return nil + } + // TODO: Check the .ready file here. + // TODO: Have some retry strategy here. + if err := m.daemon.AssertHealthy(); err != nil { + return fmt.Errorf("error checking MPS daemon health: %w", err) + } + klog.InfoS("MPS daemon is healthy", "resource", m.resourceName) + return nil +} + +func (m *mpsOptions) updateReponse(response *pluginapi.ContainerAllocateResponse) { + if m == nil || !m.enabled { + return + } + // TODO: We should check that the deviceIDs are shared using MPS. + response.Envs["CUDA_MPS_PIPE_DIRECTORY"] = m.daemon.PipeDir() + + response.Mounts = append(response.Mounts, + &pluginapi.Mount{ + ContainerPath: m.daemon.PipeDir(), + HostPath: m.hostRoot.PipeDir(m.resourceName), + }, + &pluginapi.Mount{ + ContainerPath: m.daemon.ShmDir(), + HostPath: m.hostRoot.ShmDir(m.resourceName), + }, + ) +} diff --git a/pkg/nvidia-plugin/pkg/plugin/options.go b/pkg/nvidia-plugin/pkg/plugin/options.go new file mode 100644 index 000000000..7e3799248 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/plugin/options.go @@ -0,0 +1,79 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package plugin + +import ( + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" + + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/cdi" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/imex" +) + +// Option is a function that configures the plugin options +type Option func(*options) + +// WithCDIHandler sets the CDI handler for the options +func WithCDIHandler(handler cdi.Interface) Option { + return func(m *options) { + m.cdiHandler = handler + } +} + +// WithDeviceListStrategies sets the device list strategies. +func WithDeviceListStrategies(deviceListStrategies spec.DeviceListStrategies) Option { + return func(m *options) { + m.deviceListStrategies = deviceListStrategies + } +} + +// WithNVML sets the NVML handler for the options +func WithNVML(nvmllib nvml.Interface) Option { + return func(m *options) { + m.nvmllib = nvmllib + } +} + +// WithInfoLib sets the info lib for the options.
+func WithInfoLib(infolib info.Interface) Option { + return func(m *options) { + m.infolib = infolib + } +} + +// WithFailOnInitError sets whether the options should fail on initialization errors +func WithFailOnInitError(failOnInitError bool) Option { + return func(m *options) { + m.failOnInitError = failOnInitError + } +} + +// WithConfig sets the config reference for the options +func WithConfig(config *nvidia.DeviceConfig) Option { + return func(m *options) { + m.config = config + } +} + +// WithImexChannels sets the imex channels for the manager. +func WithImexChannels(imexChannels imex.Channels) Option { + return func(m *options) { + m.imexChannels = imexChannels + } +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go b/pkg/nvidia-plugin/pkg/plugin/register.go similarity index 98% rename from pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go rename to pkg/nvidia-plugin/pkg/plugin/register.go index 0da4db014..c78141ea9 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go +++ b/pkg/nvidia-plugin/pkg/plugin/register.go @@ -94,7 +94,7 @@ func parseNvidiaNumaInfo(idx int, nvidiaTopoStr string) (int, error) { } klog.V(5).InfoS("nvidia-smi topo -m row output", "row output", words, "length", len(words)) if strings.Contains(words[0], fmt.Sprint(idx)) { - if words[numaAffinityColumnIndex] == "N/A" { + if len(words) <= numaAffinityColumnIndex || words[numaAffinityColumnIndex] == "N/A" { klog.InfoS("current card has not established numa topology", "gpu row info", words, "index", idx) return 0, nil } diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register_test.go b/pkg/nvidia-plugin/pkg/plugin/register_test.go similarity index 91% rename from pkg/device-plugin/nvidiadevice/nvinternal/plugin/register_test.go rename to pkg/nvidia-plugin/pkg/plugin/register_test.go index 5c1530878..ab12dcd40 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register_test.go +++ 
b/pkg/nvidia-plugin/pkg/plugin/register_test.go @@ -47,7 +47,7 @@ func Test_parseNvidiaNumaInfo(t *testing.T) { name: "single Tesla P4 NUMA", idx: 0, nvidiaTopoStr: `GPU0 CPU Affinity NUMA Affinity ... - ...`, + ...`, want: 0, wantErr: false, }, @@ -55,7 +55,7 @@ func Test_parseNvidiaNumaInfo(t *testing.T) { name: "two Tesla P4 NUMA topo with index 0", idx: 0, nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ... - ...`, + ...`, want: 0, wantErr: false, }, @@ -63,7 +63,15 @@ func Test_parseNvidiaNumaInfo(t *testing.T) { name: "two Tesla P4 NUMA topo with index 1", idx: 1, nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ... - ...`, + ...`, + want: 0, + wantErr: false, + }, + { + name: "NUMA Affinity is empty", + idx: 0, + nvidiaTopoStr: `GPU0 CPU Affinity NUMA Affinity GPU NUMA ID + GPU0 X`, want: 0, wantErr: false, }, diff --git a/pkg/nvidia-plugin/pkg/plugin/server.go b/pkg/nvidia-plugin/pkg/plugin/server.go new file mode 100644 index 000000000..37643f21a --- /dev/null +++ b/pkg/nvidia-plugin/pkg/plugin/server.go @@ -0,0 +1,742 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package plugin + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "net" + "os" + "os/exec" + "path" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/google/uuid" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "k8s.io/apimachinery/pkg/util/yaml" + "k8s.io/klog/v2" + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" + + "github.com/Project-HAMi/HAMi/pkg/device" + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/cdi" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/imex" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" + "github.com/Project-HAMi/HAMi/pkg/util" +) + +const ( + deviceListEnvVar = "NVIDIA_VISIBLE_DEVICES" + deviceListAsVolumeMountsHostPath = "/dev/null" + deviceListAsVolumeMountsContainerPathRoot = "/var/run/nvidia-container-devices" + NodeLockNvidia = "hami.io/mutex.lock" +) + +var ( + hostHookPath string + ConfigFile *string +) + +func init() { + hostHookPath, _ = os.LookupEnv("HOOK_PATH") +} + +// NvidiaDevicePlugin implements the Kubernetes device plugin API +type NvidiaDevicePlugin struct { + rm rm.ResourceManager + config *nvidia.DeviceConfig + deviceListStrategies spec.DeviceListStrategies + + cdiHandler cdi.Interface + cdiAnnotationPrefix string + + socket string + server *grpc.Server + health chan *rm.Device + stop chan interface{} + + imexChannels imex.Channels + + mps mpsOptions + + operatingMode string + migCurrent nvidia.MigPartedSpec + schedulerConfig nvidia.NvidiaConfig +} + +// devicePluginForResource creates a device plugin for the specified resource. 
+func (o *options) devicePluginForResource(resourceManager rm.ResourceManager) (Interface, error) { + mpsOptions, err := o.getMPSOptions(resourceManager) + if err != nil { + return nil, err + } + sConfig, mode, err := LoadNvidiaDevicePluginConfig() + if err != nil { + return nil, fmt.Errorf("failed to load nvidia plugin config: %v", err) + } + + // Initialize devices with configuration; a failure here terminates the process via klog.Fatalf + if err := device.InitDevicesWithConfig(sConfig); err != nil { + klog.Fatalf("failed to initialize devices: %v", err) + } + + plugin := NvidiaDevicePlugin{ + rm: resourceManager, + config: o.config, + deviceListStrategies: o.deviceListStrategies, + + cdiHandler: o.cdiHandler, + cdiAnnotationPrefix: *o.config.Flags.Plugin.CDIAnnotationPrefix, + + imexChannels: o.imexChannels, + + mps: mpsOptions, + + socket: getPluginSocketPath(resourceManager.Resource()), + // These will be reinitialized every + // time the plugin server is restarted. + server: nil, + health: nil, + stop: nil, + + // initialize the HAMi fields + operatingMode: mode, + schedulerConfig: sConfig.NvidiaConfig, + migCurrent: nvidia.MigPartedSpec{}, + } + return &plugin, nil +} + +func readFromConfigFile(sConfig *nvidia.NvidiaConfig) (string, error) { + jsonByte, err := os.ReadFile("/config/config.json") + mode := "hami-core" + if err != nil { + return "", err + } + var deviceConfigs nvidia.DevicePluginConfigs + err = json.Unmarshal(jsonByte, &deviceConfigs) + if err != nil { + return "", err + } + klog.Infof("Device Plugin Configs: %v", fmt.Sprintf("%v", deviceConfigs)) + for _, val := range deviceConfigs.Nodeconfig { + if os.Getenv(util.NodeNameEnvName) == val.Name { + klog.Infof("Reading config from file %s", val.Name) + if val.Devicememoryscaling > 0 { + sConfig.DeviceMemoryScaling = val.Devicememoryscaling + } + if val.Devicecorescaling > 0 { + sConfig.DeviceCoreScaling = val.Devicecorescaling + } + if val.Devicesplitcount > 0 { + sConfig.DeviceSplitCount = val.Devicesplitcount + } + if val.FilterDevice != nil &&
(len(val.FilterDevice.UUID) > 0 || len(val.FilterDevice.Index) > 0) { + nvidia.DevicePluginFilterDevice = val.FilterDevice + } + if len(val.OperatingMode) > 0 { + mode = val.OperatingMode + } + klog.Infof("FilterDevice: %v", val.FilterDevice) + } + } + return mode, nil +} + +func LoadNvidiaDevicePluginConfig() (*device.Config, string, error) { + sConfig, err := device.LoadConfig(*ConfigFile) + if err != nil { + klog.Fatalf(`failed to load device config file %s: %v`, *ConfigFile, err) + } + mode, err := readFromConfigFile(&sConfig.NvidiaConfig) + if err != nil { + klog.Errorf("readFromConfigFile err:%s", err.Error()) + } + return sConfig, mode, nil +} + +// getPluginSocketPath returns the socket to use for the specified resource. +func getPluginSocketPath(resource spec.ResourceName) string { + _, name := resource.Split() + pluginName := "nvidia-" + name + return filepath.Join(pluginapi.DevicePluginPath, pluginName) + ".sock" +} + +func (plugin *NvidiaDevicePlugin) initialize() { + plugin.server = grpc.NewServer([]grpc.ServerOption{}...) + plugin.health = make(chan *rm.Device) + plugin.stop = make(chan interface{}) +} + +func (plugin *NvidiaDevicePlugin) cleanup() { + close(plugin.stop) + plugin.server = nil + plugin.health = nil + plugin.stop = nil +} + +// Devices returns the full set of devices associated with the plugin. +func (plugin *NvidiaDevicePlugin) Devices() rm.Devices { + return plugin.rm.Devices() +} + +// Start starts the gRPC server, registers the device plugin with the Kubelet, +// and starts the device healthchecks. 
+func (plugin *NvidiaDevicePlugin) Start(kubeletSocket string) error { + plugin.initialize() + + if err := plugin.mps.waitForDaemon(); err != nil { + return fmt.Errorf("error waiting for MPS daemon: %w", err) + } + + err := plugin.Serve() + if err != nil { + klog.Errorf("Could not start device plugin for '%s': %s", plugin.rm.Resource(), err) + plugin.cleanup() + return err + } + klog.Infof("Starting to serve '%s' on %s", plugin.rm.Resource(), plugin.socket) + + err = plugin.Register(kubeletSocket) + if err != nil { + klog.Errorf("Could not register device plugin: %s", err) + return errors.Join(err, plugin.Stop()) + } + klog.Infof("Registered device plugin for '%s' with Kubelet", plugin.rm.Resource()) + + if plugin.operatingMode == "mig" { + cmd := exec.Command("nvidia-mig-parted", "export") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + err := cmd.Run() + if err != nil { + klog.Fatalf("nvidia-mig-parted failed with %s\n", err) + } + outStr := stdout.Bytes() + yaml.Unmarshal(outStr, &plugin.migCurrent) + os.WriteFile("/tmp/migconfig.yaml", outStr, os.ModePerm) + if len(plugin.migCurrent.MigConfigs["current"]) == 1 && len(plugin.migCurrent.MigConfigs["current"][0].Devices) == 0 { + idx := 0 + plugin.migCurrent.MigConfigs["current"][0].Devices = make([]int32, 0) + for idx < GetDeviceNums() { + plugin.migCurrent.MigConfigs["current"][0].Devices = append(plugin.migCurrent.MigConfigs["current"][0].Devices, int32(idx)) + idx++ + } + } + klog.Infoln("Mig export", plugin.migCurrent) + } + + go func() { + // TODO: add MPS health check + err := plugin.rm.CheckHealth(plugin.stop, plugin.health) + if err != nil { + klog.Errorf("Failed to start health check: %v; continuing with health checks disabled", err) + } + }() + + go func() { + plugin.WatchAndRegister() + }() + + return nil +} + +// Stop stops the gRPC server. 
+func (plugin *NvidiaDevicePlugin) Stop() error { + if plugin == nil || plugin.server == nil { + return nil + } + klog.Infof("Stopping to serve '%s' on %s", plugin.rm.Resource(), plugin.socket) + plugin.server.Stop() + if err := os.Remove(plugin.socket); err != nil && !os.IsNotExist(err) { + return err + } + plugin.cleanup() + return nil +} + +// Serve starts the gRPC server of the device plugin. +func (plugin *NvidiaDevicePlugin) Serve() error { + os.Remove(plugin.socket) + sock, err := net.Listen("unix", plugin.socket) + if err != nil { + return err + } + + pluginapi.RegisterDevicePluginServer(plugin.server, plugin) + + go func() { + lastCrashTime := time.Now() + restartCount := 0 + + for { + // quite if it has been restarted too often + // i.e. if server has crashed more than 5 times and it didn't last more than one hour each time + if restartCount > 5 { + // quit + klog.Fatalf("GRPC server for '%s' has repeatedly crashed recently. Quitting", plugin.rm.Resource()) + } + + klog.Infof("Starting GRPC server for '%s'", plugin.rm.Resource()) + err := plugin.server.Serve(sock) + if err == nil { + break + } + + klog.Infof("GRPC server for '%s' crashed with error: %v", plugin.rm.Resource(), err) + + timeSinceLastCrash := time.Since(lastCrashTime).Seconds() + lastCrashTime = time.Now() + if timeSinceLastCrash > 3600 { + // it has been one hour since the last crash.. reset the count + // to reflect on the frequency + restartCount = 0 + } else { + restartCount++ + } + } + }() + + // Wait for server to start by launching a blocking connection + conn, err := plugin.dial(plugin.socket, 5*time.Second) + if err != nil { + return err + } + conn.Close() + + return nil +} + +// Register registers the device plugin for the given resourceName with Kubelet. 
+func (plugin *NvidiaDevicePlugin) Register(kubeletSocket string) error { + if kubeletSocket == "" { + klog.Info("Skipping registration with Kubelet") + return nil + } + + conn, err := plugin.dial(kubeletSocket, 5*time.Second) + if err != nil { + return err + } + defer conn.Close() + + client := pluginapi.NewRegistrationClient(conn) + reqt := &pluginapi.RegisterRequest{ + Version: pluginapi.Version, + Endpoint: path.Base(plugin.socket), + ResourceName: string(plugin.rm.Resource()), + Options: &pluginapi.DevicePluginOptions{ + GetPreferredAllocationAvailable: true, + }, + } + + _, err = client.Register(context.Background(), reqt) + if err != nil { + return err + } + return nil +} + +// GetDevicePluginOptions returns the values of the optional settings for this plugin +func (plugin *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { + options := &pluginapi.DevicePluginOptions{ + GetPreferredAllocationAvailable: true, + } + return options, nil +} + +// ListAndWatch lists devices and update that list according to the health status +func (plugin *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { + if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil { + return err + } + + for { + select { + case <-plugin.stop: + return nil + case d := <-plugin.health: + // FIXME: there is no way to recover from the Unhealthy state. 
+ d.Health = pluginapi.Unhealthy + klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.ID) + if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil { + return nil + } + } + } +} + +// GetPreferredAllocation returns the preferred allocation from the set of devices specified in the request +func (plugin *NvidiaDevicePlugin) GetPreferredAllocation(ctx context.Context, r *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) { + response := &pluginapi.PreferredAllocationResponse{} + for _, req := range r.ContainerRequests { + devices, err := plugin.rm.GetPreferredAllocation(req.AvailableDeviceIDs, req.MustIncludeDeviceIDs, int(req.AllocationSize)) + if err != nil { + return nil, fmt.Errorf("error getting list of preferred allocation devices: %v", err) + } + + resp := &pluginapi.ContainerPreferredAllocationResponse{ + DeviceIDs: devices, + } + + response.ContainerResponses = append(response.ContainerResponses, resp) + } + return response, nil +} + +// Allocate which return list of devices. 
+func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { + responses := pluginapi.AllocateResponse{} + + nodeName := os.Getenv(util.NodeNameEnvName) + current, err := util.GetPendingPod(ctx, nodeName) + if err != nil { + return &responses, err + } + + for idx, req := range reqs.ContainerRequests { + if err := plugin.rm.ValidateRequest(req.DevicesIDs); err != nil { + return nil, fmt.Errorf("invalid allocation request for %q: %w", plugin.rm.Resource(), err) + } + currentCtr, devreq, err := GetNextDeviceRequest(nvidia.NvidiaGPUDevice, *current) + klog.Infoln("deviceAllocateFromAnnotation=", devreq) + if err != nil { + device.PodAllocationFailed(nodeName, current, NodeLockNvidia) + return &responses, err + } + if len(devreq) != len(reqs.ContainerRequests[idx].DevicesIDs) { + device.PodAllocationFailed(nodeName, current, NodeLockNvidia) + return &responses, errors.New("device number not matched") + } + response, err := plugin.getAllocateResponse(plugin.GetContainerDeviceStrArray(devreq)) + if err != nil { + return nil, fmt.Errorf("failed to get allocate response: %v", err) + } + + err = EraseNextDeviceTypeFromAnnotation(nvidia.NvidiaGPUDevice, *current) + if err != nil { + device.PodAllocationFailed(nodeName, current, NodeLockNvidia) + return &responses, err + } + + if plugin.operatingMode != "mig" { + for i, dev := range devreq { + limitKey := fmt.Sprintf("CUDA_DEVICE_MEMORY_LIMIT_%v", i) + response.Envs[limitKey] = fmt.Sprintf("%vm", dev.Usedmem) + } + response.Envs["CUDA_DEVICE_SM_LIMIT"] = fmt.Sprint(devreq[0].Usedcores) + response.Envs["CUDA_DEVICE_MEMORY_SHARED_CACHE"] = fmt.Sprintf("%s/vgpu/%v.cache", hostHookPath, uuid.New().String()) + if plugin.schedulerConfig.DeviceMemoryScaling > 1 { + response.Envs["CUDA_OVERSUBSCRIBE"] = "true" + } + if plugin.schedulerConfig.DisableCoreLimit { + response.Envs[util.CoreLimitSwitch] = "disable" + } + cacheFileHostDirectory := 
fmt.Sprintf("%s/vgpu/containers/%s_%s", hostHookPath, current.UID, currentCtr.Name) + os.RemoveAll(cacheFileHostDirectory) + + os.MkdirAll(cacheFileHostDirectory, 0777) + os.Chmod(cacheFileHostDirectory, 0777) + os.MkdirAll("/tmp/vgpulock", 0777) + os.Chmod("/tmp/vgpulock", 0777) + response.Mounts = append(response.Mounts, + &pluginapi.Mount{ContainerPath: fmt.Sprintf("%s/vgpu/libvgpu.so", hostHookPath), + HostPath: GetLibPath(), + ReadOnly: true}, + &pluginapi.Mount{ContainerPath: fmt.Sprintf("%s/vgpu", hostHookPath), + HostPath: cacheFileHostDirectory, + ReadOnly: false}, + &pluginapi.Mount{ContainerPath: "/tmp/vgpulock", + HostPath: "/tmp/vgpulock", + ReadOnly: false}, + ) + found := false + for _, val := range currentCtr.Env { + if strings.Compare(val.Name, "CUDA_DISABLE_CONTROL") == 0 { + // if env existed but is set to false or can not be parsed, ignore + t, _ := strconv.ParseBool(val.Value) + if !t { + continue + } + // only env existed and set to true, we mark it "found" + found = true + break + } + } + if !found { + response.Mounts = append(response.Mounts, &pluginapi.Mount{ContainerPath: "/etc/ld.so.preload", + HostPath: hostHookPath + "/vgpu/ld.so.preload", + ReadOnly: true}, + ) + } + _, err = os.Stat(fmt.Sprintf("%s/vgpu/license", hostHookPath)) + if err == nil { + response.Mounts = append(response.Mounts, &pluginapi.Mount{ + ContainerPath: "/tmp/license", + HostPath: fmt.Sprintf("%s/vgpu/license", hostHookPath), + ReadOnly: true, + }) + response.Mounts = append(response.Mounts, &pluginapi.Mount{ + ContainerPath: "/usr/bin/vgpuvalidator", + HostPath: fmt.Sprintf("%s/vgpu/vgpuvalidator", hostHookPath), + ReadOnly: true, + }) + } + } + responses.ContainerResponses = append(responses.ContainerResponses, response) + } + + return &responses, nil +} + +func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*pluginapi.ContainerAllocateResponse, error) { + deviceIDs := plugin.deviceIDsFromAnnotatedDeviceIDs(requestIds) + + // Create an 
empty response that will be updated as required below. + response := &pluginapi.ContainerAllocateResponse{ + Envs: make(map[string]string), + } + if plugin.deviceListStrategies.AnyCDIEnabled() { + responseID := uuid.New().String() + if err := plugin.updateResponseForCDI(response, responseID, deviceIDs...); err != nil { + return nil, fmt.Errorf("failed to get allocate response for CDI: %v", err) + } + } + if plugin.mps.enabled { + plugin.updateResponseForMPS(response) + } + + // The following modifications are only made if at least one non-CDI device + // list strategy is selected. + if plugin.deviceListStrategies.AllCDIEnabled() { + return response, nil + } + + if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyEnvVar) { + plugin.updateResponseForDeviceListEnvVar(response, deviceIDs...) + plugin.updateResponseForImexChannelsEnvVar(response) + } + if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyVolumeMounts) { + plugin.updateResponseForDeviceMounts(response, deviceIDs...) + } + if *plugin.config.Flags.Plugin.PassDeviceSpecs { + response.Devices = append(response.Devices, plugin.apiDeviceSpecs(*plugin.config.Flags.NvidiaDevRoot, requestIds)...) + } + if *plugin.config.Flags.GDSEnabled { + response.Envs["NVIDIA_GDS"] = "enabled" + } + if *plugin.config.Flags.MOFEDEnabled { + response.Envs["NVIDIA_MOFED"] = "enabled" + } + return response, nil +} + +// updateResponseForMPS ensures that the ContainerAllocate response contains the information required to use MPS. +// This includes per-resource pipe and log directories as well as a global daemon-specific shm +// and assumes that an MPS control daemon has already been started. +func (plugin NvidiaDevicePlugin) updateResponseForMPS(response *pluginapi.ContainerAllocateResponse) { + plugin.mps.updateReponse(response) +} + +// updateResponseForCDI updates the specified response for the given device IDs. 
+// This response contains the annotations required to trigger CDI injection in the container engine or nvidia-container-runtime. +func (plugin *NvidiaDevicePlugin) updateResponseForCDI(response *pluginapi.ContainerAllocateResponse, responseID string, deviceIDs ...string) error { + var devices []string + for _, id := range deviceIDs { + devices = append(devices, plugin.cdiHandler.QualifiedName("gpu", id)) + } + for _, channel := range plugin.imexChannels { + devices = append(devices, plugin.cdiHandler.QualifiedName("imex-channel", channel.ID)) + } + if *plugin.config.Flags.GDSEnabled { + devices = append(devices, plugin.cdiHandler.QualifiedName("gds", "all")) + } + if *plugin.config.Flags.MOFEDEnabled { + devices = append(devices, plugin.cdiHandler.QualifiedName("mofed", "all")) + } + + if len(devices) == 0 { + return nil + } + + if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyCDIAnnotations) { + annotations, err := plugin.getCDIDeviceAnnotations(responseID, devices...) + if err != nil { + return err + } + response.Annotations = annotations + } + if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyCDICRI) { + for _, device := range devices { + cdiDevice := pluginapi.CDIDevice{ + Name: device, + } + response.CDIDevices = append(response.CDIDevices, &cdiDevice) + } + } + + return nil +} + +func (plugin *NvidiaDevicePlugin) getCDIDeviceAnnotations(id string, devices ...string) (map[string]string, error) { + annotations, err := cdiapi.UpdateAnnotations(map[string]string{}, "nvidia-device-plugin", id, devices) + if err != nil { + return nil, fmt.Errorf("failed to add CDI annotations: %v", err) + } + + if plugin.cdiAnnotationPrefix == spec.DefaultCDIAnnotationPrefix { + return annotations, nil + } + + // update annotations if a custom CDI prefix is configured + updatedAnnotations := make(map[string]string) + for k, v := range annotations { + newKey := plugin.cdiAnnotationPrefix + strings.TrimPrefix(k, spec.DefaultCDIAnnotationPrefix) + 
updatedAnnotations[newKey] = v + } + + return updatedAnnotations, nil +} + +// PreStartContainer is unimplemented for this plugin +func (plugin *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) { + return &pluginapi.PreStartContainerResponse{}, nil +} + +// dial establishes the gRPC communication with the registered device plugin. +func (plugin *NvidiaDevicePlugin) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + //nolint:staticcheck // TODO: Switch to grpc.NewClient + c, err := grpc.DialContext(ctx, unixSocketPath, + grpc.WithTransportCredentials(insecure.NewCredentials()), + //nolint:staticcheck // TODO: WithBlock is deprecated. + grpc.WithBlock(), + grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { + return (&net.Dialer{}).DialContext(ctx, "unix", addr) + }), + ) + if err != nil { + return nil, err + } + + return c, nil +} + +func (plugin *NvidiaDevicePlugin) deviceIDsFromAnnotatedDeviceIDs(ids []string) []string { + var deviceIDs []string + if *plugin.config.Flags.Plugin.DeviceIDStrategy == spec.DeviceIDStrategyUUID { + deviceIDs = rm.AnnotatedIDs(ids).GetIDs() + } + if *plugin.config.Flags.Plugin.DeviceIDStrategy == spec.DeviceIDStrategyIndex { + deviceIDs = plugin.rm.Devices().Subset(ids).GetIndices() + } + return deviceIDs +} + +func (plugin *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device { + return plugin.rm.Devices().GetPluginDevices() +} + +// updateResponseForDeviceListEnvVar sets the environment variable for the requested devices. 
+func (plugin *NvidiaDevicePlugin) updateResponseForDeviceListEnvVar(response *pluginapi.ContainerAllocateResponse, deviceIDs ...string) { + response.Envs[deviceListEnvVar] = strings.Join(deviceIDs, ",") +} + +// updateResponseForImexChannelsEnvVar sets the environment variable for the requested IMEX channels. +func (plugin *NvidiaDevicePlugin) updateResponseForImexChannelsEnvVar(response *pluginapi.ContainerAllocateResponse) { + var channelIDs []string + for _, channel := range plugin.imexChannels { + channelIDs = append(channelIDs, channel.ID) + } + if len(channelIDs) > 0 { + response.Envs[spec.ImexChannelEnvVar] = strings.Join(channelIDs, ",") + } +} + +// updateResponseForDeviceMounts sets the mounts required to request devices if volume mounts are used. +func (plugin *NvidiaDevicePlugin) updateResponseForDeviceMounts(response *pluginapi.ContainerAllocateResponse, deviceIDs ...string) { + plugin.updateResponseForDeviceListEnvVar(response, deviceListAsVolumeMountsContainerPathRoot) + + for _, id := range deviceIDs { + mount := &pluginapi.Mount{ + HostPath: deviceListAsVolumeMountsHostPath, + ContainerPath: filepath.Join(deviceListAsVolumeMountsContainerPathRoot, id), + } + response.Mounts = append(response.Mounts, mount) + } + for _, channel := range plugin.imexChannels { + mount := &pluginapi.Mount{ + HostPath: deviceListAsVolumeMountsHostPath, + ContainerPath: filepath.Join(deviceListAsVolumeMountsContainerPathRoot, "imex", channel.ID), + } + response.Mounts = append(response.Mounts, mount) + } +} + +func (plugin *NvidiaDevicePlugin) apiDeviceSpecs(devRoot string, ids []string) []*pluginapi.DeviceSpec { + optional := map[string]bool{ + "/dev/nvidiactl": true, + "/dev/nvidia-uvm": true, + "/dev/nvidia-uvm-tools": true, + "/dev/nvidia-modeset": true, + } + + paths := plugin.rm.GetDevicePaths(ids) + + var specs []*pluginapi.DeviceSpec + for _, p := range paths { + if optional[p] { + if _, err := os.Stat(p); err != nil { + continue + } + } + spec := 
&pluginapi.DeviceSpec{ + ContainerPath: p, + HostPath: filepath.Join(devRoot, p), + Permissions: "rw", + } + specs = append(specs, spec) + } + + for _, channel := range plugin.imexChannels { + spec := &pluginapi.DeviceSpec{ + ContainerPath: channel.Path, + // TODO: The HostPath property for a channel is not the correct value to use here. + // The `devRoot` there represents the devRoot in the current container when discovering devices + // and is set to "{{ .*config.Flags.Plugin.ContainerDriverRoot }}/dev". + // The devRoot in this context is the {{ .config.Flags.NvidiaDevRoot }} and defines the + // root for device nodes on the host. This is usually / or /run/nvidia/driver when the + // driver container is used. + HostPath: filepath.Join(devRoot, channel.Path), + Permissions: "rw", + } + specs = append(specs, spec) + } + + return specs +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/server_test.go b/pkg/nvidia-plugin/pkg/plugin/server_test.go similarity index 58% rename from pkg/device-plugin/nvidiadevice/nvinternal/plugin/server_test.go rename to pkg/nvidia-plugin/pkg/plugin/server_test.go index 1574b3ff7..cce940138 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/server_test.go +++ b/pkg/nvidia-plugin/pkg/plugin/server_test.go @@ -1,46 +1,31 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ package plugin import ( - "fmt" "testing" - v1 "github.com/NVIDIA/k8s-device-plugin/api/config/v1" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" "github.com/stretchr/testify/require" - kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + v1 "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/cdi" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/imex" ) func TestCDIAllocateResponse(t *testing.T) { @@ -49,31 +34,22 @@ func TestCDIAllocateResponse(t *testing.T) { deviceIds []string deviceListStrategies []string CDIPrefix string - CDIEnabled bool GDSEnabled bool MOFEDEnabled bool - expectedResponse kubeletdevicepluginv1beta1.ContainerAllocateResponse + imexChannels []*imex.Channel + expectedResponse pluginapi.ContainerAllocateResponse }{ { description: "empty device list has empty response", deviceListStrategies: []string{"cdi-annotations"}, CDIPrefix: "cdi.k8s.io/", - CDIEnabled: true, - }, - { - description: "CDI disabled has empty response", - deviceIds: []string{"gpu0"}, - deviceListStrategies: []string{"cdi-annotations"}, - CDIPrefix: "cdi.k8s.io/", - CDIEnabled: false, }, { description: "single device is added to annotations", deviceIds: []string{"gpu0"}, deviceListStrategies: []string{"cdi-annotations"}, CDIPrefix: "cdi.k8s.io/", - CDIEnabled: true, - expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + expectedResponse: pluginapi.ContainerAllocateResponse{ Annotations: map[string]string{ "cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gpu=gpu0", }, @@ -84,8 +60,7 @@ func TestCDIAllocateResponse(t *testing.T) { deviceIds: []string{"gpu0"}, deviceListStrategies: []string{"cdi-annotations"}, CDIPrefix: "custom.cdi.k8s.io/", - CDIEnabled: true, - 
expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + expectedResponse: pluginapi.ContainerAllocateResponse{ Annotations: map[string]string{ "custom.cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gpu=gpu0", }, @@ -96,8 +71,7 @@ func TestCDIAllocateResponse(t *testing.T) { deviceIds: []string{"gpu0", "gpu1"}, deviceListStrategies: []string{"cdi-annotations"}, CDIPrefix: "cdi.k8s.io/", - CDIEnabled: true, - expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + expectedResponse: pluginapi.ContainerAllocateResponse{ Annotations: map[string]string{ "cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gpu=gpu0,nvidia.com/gpu=gpu1", }, @@ -108,8 +82,7 @@ func TestCDIAllocateResponse(t *testing.T) { deviceIds: []string{"gpu0", "gpu1"}, deviceListStrategies: []string{"cdi-annotations"}, CDIPrefix: "custom.cdi.k8s.io/", - CDIEnabled: true, - expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + expectedResponse: pluginapi.ContainerAllocateResponse{ Annotations: map[string]string{ "custom.cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gpu=gpu0,nvidia.com/gpu=gpu1", }, @@ -119,9 +92,8 @@ func TestCDIAllocateResponse(t *testing.T) { description: "mofed devices are selected if configured", deviceListStrategies: []string{"cdi-annotations"}, CDIPrefix: "cdi.k8s.io/", - CDIEnabled: true, MOFEDEnabled: true, - expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + expectedResponse: pluginapi.ContainerAllocateResponse{ Annotations: map[string]string{ "cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/mofed=all", }, @@ -131,9 +103,8 @@ func TestCDIAllocateResponse(t *testing.T) { description: "gds devices are selected if configured", deviceListStrategies: []string{"cdi-annotations"}, CDIPrefix: "cdi.k8s.io/", - CDIEnabled: true, GDSEnabled: true, - expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + expectedResponse: pluginapi.ContainerAllocateResponse{ Annotations: 
map[string]string{ "cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gds=all", }, @@ -144,18 +115,29 @@ func TestCDIAllocateResponse(t *testing.T) { deviceIds: []string{"gpu0"}, deviceListStrategies: []string{"cdi-annotations"}, CDIPrefix: "cdi.k8s.io/", - CDIEnabled: true, GDSEnabled: true, MOFEDEnabled: true, - expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + expectedResponse: pluginapi.ContainerAllocateResponse{ Annotations: map[string]string{ "cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gpu=gpu0,nvidia.com/gds=all,nvidia.com/mofed=all", }, }, }, + { + description: "imex channel is included with devices", + deviceListStrategies: []string{"cdi-annotations"}, + CDIPrefix: "cdi.k8s.io/", + imexChannels: []*imex.Channel{{ID: "0"}}, + expectedResponse: pluginapi.ContainerAllocateResponse{ + Annotations: map[string]string{ + "cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/imex-channel=0", + }, + }, + }, } - for _, tc := range testCases { + for i := range testCases { + tc := testCases[i] t.Run(tc.description, func(t *testing.T) { deviceListStrategies, _ := v1.NewDeviceListStrategies(tc.deviceListStrategies) plugin := NvidiaDevicePlugin{ @@ -174,27 +156,16 @@ func TestCDIAllocateResponse(t *testing.T) { return "nvidia.com/" + c + "=" + s }, }, - cdiEnabled: tc.CDIEnabled, deviceListStrategies: deviceListStrategies, cdiAnnotationPrefix: tc.CDIPrefix, + imexChannels: tc.imexChannels, } - response, err := plugin.getAllocateResponseForCDI("uuid", tc.deviceIds) + response := pluginapi.ContainerAllocateResponse{} + err := plugin.updateResponseForCDI(&response, "uuid", tc.deviceIds...) 
require.Nil(t, err) require.EqualValues(t, &tc.expectedResponse, &response) }) } } - -func Test_pathGeneration(t *testing.T) { - hostHookPath := "/usr/local/vgpu" - uid := "testuid" - cname := "testcname" - expected := "/usr/local/vgpu/containers/testuid_testcname" - result := fmt.Sprintf("%s/containers/%s_%s", hostHookPath, uid, cname) - - if expected != result { - t.Errorf("Expected %s, got %s", expected, result) - } -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/util.go b/pkg/nvidia-plugin/pkg/plugin/util.go similarity index 99% rename from pkg/device-plugin/nvidiadevice/nvinternal/plugin/util.go rename to pkg/nvidia-plugin/pkg/plugin/util.go index d0bd35379..2b16900b3 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/util.go +++ b/pkg/nvidia-plugin/pkg/plugin/util.go @@ -29,7 +29,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/klog/v2" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/info" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/info" "github.com/Project-HAMi/HAMi/pkg/util" ) diff --git a/pkg/nvidia-plugin/pkg/resource/cuda-device.go b/pkg/nvidia-plugin/pkg/resource/cuda-device.go new file mode 100644 index 000000000..e8c7f28c0 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/resource/cuda-device.go @@ -0,0 +1,110 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package resource + +import ( + "fmt" + + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/cuda" +) + +type cudaDevice cuda.Device + +var _ Device = (*cudaDevice)(nil) + +// NewCudaDevice constructs a new CUDA device +func NewCudaDevice(d cuda.Device) Device { + device := cudaDevice(d) + return &device +} + +// GetAttributes is unsupported for CUDA devices +func (d *cudaDevice) GetAttributes() (map[string]interface{}, error) { + return nil, fmt.Errorf("GetAttributes is not supported for CUDA devices") +} + +// GetCudaComputeCapability returns the CUDA Compute Capability major and minor versions. +// If the device is a MIG device (i.e. a compute instance) these are 0 +func (d *cudaDevice) GetCudaComputeCapability() (int, int, error) { + major, r := cuda.Device(*d).GetAttribute(cuda.COMPUTE_CAPABILITY_MAJOR) + if r != cuda.SUCCESS { + return 0, 0, fmt.Errorf("failed to get CUDA compute capability major for device: result=%v", r) + } + + minor, r := cuda.Device(*d).GetAttribute(cuda.COMPUTE_CAPABILITY_MINOR) + if r != cuda.SUCCESS { + return 0, 0, fmt.Errorf("failed to get CUDA compute capability minor for device: result=%v", r) + } + + return major, minor, nil +} + +// GetDeviceHandleFromMigDeviceHandle is unsupported for CUDA devices +func (d *cudaDevice) GetDeviceHandleFromMigDeviceHandle() (Device, error) { + return nil, fmt.Errorf("GetDeviceHandleFromMigDeviceHandle is unsupported for CUDA devices") +} + +// GetTotalMemoryMB returns the total memory for a device +func (d *cudaDevice) GetTotalMemoryMB() (uint64, error) { + total, r := cuda.Device(*d).TotalMem() + if r != cuda.SUCCESS { + return 0, fmt.Errorf("failed to get memory info for device: %v", r) + } + return total / (1024 * 1024), nil +} + +// GetMigDevices is unsupported for CUDA devices +func (d *cudaDevice) GetMigDevices() ([]Device, error) { + return nil, fmt.Errorf("GetMigDevices is unsupported for CUDA devices") +} + +// GetName returns the device name / model. 
+func (d *cudaDevice) GetName() (string, error) { + name, r := cuda.Device(*d).GetName() + if r != cuda.SUCCESS { + return "", fmt.Errorf("failed to get device name: %v", r) + } + + return name, nil +} + +// GetUUID is unsupported for CUDA devices +func (d *cudaDevice) GetUUID() (string, error) { + return "", fmt.Errorf("GetUUID is unsupported for CUDA devices") +} + +// IsMigCapable always returns false for CUDA devices +func (d *cudaDevice) IsMigCapable() (bool, error) { + return false, nil +} + +// IsMigEnabled always returns false for CUDA devices +func (d *cudaDevice) IsMigEnabled() (bool, error) { + return false, nil +} + +func (d *cudaDevice) GetPCIClass() (uint32, error) { + return 0, nil +} + +func (d *cudaDevice) IsFabricAttached() (bool, error) { + return false, nil +} + +func (d *cudaDevice) GetFabricIDs() (string, string, error) { + return "", "", fmt.Errorf("GetFabricIDs is not supported for CUDA devices") +} diff --git a/pkg/nvidia-plugin/pkg/resource/cuda-lib.go b/pkg/nvidia-plugin/pkg/resource/cuda-lib.go new file mode 100644 index 000000000..0dc045a8d --- /dev/null +++ b/pkg/nvidia-plugin/pkg/resource/cuda-lib.go @@ -0,0 +1,88 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package resource + +import ( + "fmt" + + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/cuda" +) + +type cudaLib struct{} + +var _ Manager = (*cudaLib)(nil) + +// NewCudaManager returns an resource manger for CUDA devices +func NewCudaManager() Manager { + return &cudaLib{} +} + +// GetDevices returns the CUDA devices available on the system +func (l *cudaLib) GetDevices() ([]Device, error) { + count, r := cuda.DeviceGetCount() + if r != cuda.SUCCESS { + return nil, fmt.Errorf("failed to get number of CUDA devices: %v", r) + } + + var devices []Device + for i := 0; i < count; i++ { + d, r := cuda.DeviceGet(i) + if r != cuda.SUCCESS { + return nil, fmt.Errorf("failed to get CUDA device %v: %v", i, r) + } + devices = append(devices, NewCudaDevice(d)) + } + + return devices, nil +} + +// GetCudaDriverVersion returns the CUDA driver version +func (l *cudaLib) GetCudaDriverVersion() (int, int, error) { + version, r := cuda.DriverGetVersion() + if r != cuda.SUCCESS { + return 0, 0, fmt.Errorf("failed to get driver version: %v", r) + } + + major := version / 1000 + minor := version % 100 / 10 + + return major, minor, nil +} + +// GetDriverVersion returns the driver version. +// This is currently "unknown" for Tegra systems. +func (l *cudaLib) GetDriverVersion() (string, error) { + return "unknown.unknown.unknown", nil +} + +// Init initializes the CUDA library. +func (l *cudaLib) Init() error { + r := cuda.Init() + if r != cuda.SUCCESS { + return fmt.Errorf("%v", r) + } + return nil +} + +// Shutdown shuts down the CUDA library. +func (l *cudaLib) Shutdown() (err error) { + r := cuda.Shutdown() + if r != cuda.SUCCESS { + return fmt.Errorf("%v", r) + } + return nil +} diff --git a/pkg/nvidia-plugin/pkg/resource/device_mock.go b/pkg/nvidia-plugin/pkg/resource/device_mock.go new file mode 100644 index 000000000..eadc8932b --- /dev/null +++ b/pkg/nvidia-plugin/pkg/resource/device_mock.go @@ -0,0 +1,437 @@ +// Code generated by moq; DO NOT EDIT. 
+// github.com/matryer/moq + +package resource + +import ( + "sync" +) + +// Ensure, that DeviceMock does implement Device. +// If this is not the case, regenerate this file with moq. +var _ Device = &DeviceMock{} + +// DeviceMock is a mock implementation of Device. +// +// func TestSomethingThatUsesDevice(t *testing.T) { +// +// // make and configure a mocked Device +// mockedDevice := &DeviceMock{ +// GetAttributesFunc: func() (map[string]interface{}, error) { +// panic("mock out the GetAttributes method") +// }, +// GetCudaComputeCapabilityFunc: func() (int, int, error) { +// panic("mock out the GetCudaComputeCapability method") +// }, +// GetDeviceHandleFromMigDeviceHandleFunc: func() (Device, error) { +// panic("mock out the GetDeviceHandleFromMigDeviceHandle method") +// }, +// GetFabricIDsFunc: func() (string, string, error) { +// panic("mock out the GetFabricIDs method") +// }, +// GetMigDevicesFunc: func() ([]Device, error) { +// panic("mock out the GetMigDevices method") +// }, +// GetNameFunc: func() (string, error) { +// panic("mock out the GetName method") +// }, +// GetPCIClassFunc: func() (uint32, error) { +// panic("mock out the GetPCIClass method") +// }, +// GetTotalMemoryMBFunc: func() (uint64, error) { +// panic("mock out the GetTotalMemoryMB method") +// }, +// IsFabricAttachedFunc: func() (bool, error) { +// panic("mock out the IsFabricAttached method") +// }, +// IsMigCapableFunc: func() (bool, error) { +// panic("mock out the IsMigCapable method") +// }, +// IsMigEnabledFunc: func() (bool, error) { +// panic("mock out the IsMigEnabled method") +// }, +// } +// +// // use mockedDevice in code that requires Device +// // and then make assertions. +// +// } +type DeviceMock struct { + // GetAttributesFunc mocks the GetAttributes method. + GetAttributesFunc func() (map[string]interface{}, error) + + // GetCudaComputeCapabilityFunc mocks the GetCudaComputeCapability method. 
+ GetCudaComputeCapabilityFunc func() (int, int, error) + + // GetDeviceHandleFromMigDeviceHandleFunc mocks the GetDeviceHandleFromMigDeviceHandle method. + GetDeviceHandleFromMigDeviceHandleFunc func() (Device, error) + + // GetFabricIDsFunc mocks the GetFabricIDs method. + GetFabricIDsFunc func() (string, string, error) + + // GetMigDevicesFunc mocks the GetMigDevices method. + GetMigDevicesFunc func() ([]Device, error) + + // GetNameFunc mocks the GetName method. + GetNameFunc func() (string, error) + + // GetPCIClassFunc mocks the GetPCIClass method. + GetPCIClassFunc func() (uint32, error) + + // GetTotalMemoryMBFunc mocks the GetTotalMemoryMB method. + GetTotalMemoryMBFunc func() (uint64, error) + + // IsFabricAttachedFunc mocks the IsFabricAttached method. + IsFabricAttachedFunc func() (bool, error) + + // IsMigCapableFunc mocks the IsMigCapable method. + IsMigCapableFunc func() (bool, error) + + // IsMigEnabledFunc mocks the IsMigEnabled method. + IsMigEnabledFunc func() (bool, error) + + // calls tracks calls to the methods. + calls struct { + // GetAttributes holds details about calls to the GetAttributes method. + GetAttributes []struct { + } + // GetCudaComputeCapability holds details about calls to the GetCudaComputeCapability method. + GetCudaComputeCapability []struct { + } + // GetDeviceHandleFromMigDeviceHandle holds details about calls to the GetDeviceHandleFromMigDeviceHandle method. + GetDeviceHandleFromMigDeviceHandle []struct { + } + // GetFabricIDs holds details about calls to the GetFabricIDs method. + GetFabricIDs []struct { + } + // GetMigDevices holds details about calls to the GetMigDevices method. + GetMigDevices []struct { + } + // GetName holds details about calls to the GetName method. + GetName []struct { + } + // GetPCIClass holds details about calls to the GetPCIClass method. + GetPCIClass []struct { + } + // GetTotalMemoryMB holds details about calls to the GetTotalMemoryMB method. 
+ GetTotalMemoryMB []struct { + } + // IsFabricAttached holds details about calls to the IsFabricAttached method. + IsFabricAttached []struct { + } + // IsMigCapable holds details about calls to the IsMigCapable method. + IsMigCapable []struct { + } + // IsMigEnabled holds details about calls to the IsMigEnabled method. + IsMigEnabled []struct { + } + } + lockGetAttributes sync.RWMutex + lockGetCudaComputeCapability sync.RWMutex + lockGetDeviceHandleFromMigDeviceHandle sync.RWMutex + lockGetFabricIDs sync.RWMutex + lockGetMigDevices sync.RWMutex + lockGetName sync.RWMutex + lockGetPCIClass sync.RWMutex + lockGetTotalMemoryMB sync.RWMutex + lockIsFabricAttached sync.RWMutex + lockIsMigCapable sync.RWMutex + lockIsMigEnabled sync.RWMutex +} + +// GetAttributes calls GetAttributesFunc. +func (mock *DeviceMock) GetAttributes() (map[string]interface{}, error) { + if mock.GetAttributesFunc == nil { + panic("DeviceMock.GetAttributesFunc: method is nil but Device.GetAttributes was just called") + } + callInfo := struct { + }{} + mock.lockGetAttributes.Lock() + mock.calls.GetAttributes = append(mock.calls.GetAttributes, callInfo) + mock.lockGetAttributes.Unlock() + return mock.GetAttributesFunc() +} + +// GetAttributesCalls gets all the calls that were made to GetAttributes. +// Check the length with: +// +// len(mockedDevice.GetAttributesCalls()) +func (mock *DeviceMock) GetAttributesCalls() []struct { +} { + var calls []struct { + } + mock.lockGetAttributes.RLock() + calls = mock.calls.GetAttributes + mock.lockGetAttributes.RUnlock() + return calls +} + +// GetCudaComputeCapability calls GetCudaComputeCapabilityFunc. 
+func (mock *DeviceMock) GetCudaComputeCapability() (int, int, error) { + if mock.GetCudaComputeCapabilityFunc == nil { + panic("DeviceMock.GetCudaComputeCapabilityFunc: method is nil but Device.GetCudaComputeCapability was just called") + } + callInfo := struct { + }{} + mock.lockGetCudaComputeCapability.Lock() + mock.calls.GetCudaComputeCapability = append(mock.calls.GetCudaComputeCapability, callInfo) + mock.lockGetCudaComputeCapability.Unlock() + return mock.GetCudaComputeCapabilityFunc() +} + +// GetCudaComputeCapabilityCalls gets all the calls that were made to GetCudaComputeCapability. +// Check the length with: +// +// len(mockedDevice.GetCudaComputeCapabilityCalls()) +func (mock *DeviceMock) GetCudaComputeCapabilityCalls() []struct { +} { + var calls []struct { + } + mock.lockGetCudaComputeCapability.RLock() + calls = mock.calls.GetCudaComputeCapability + mock.lockGetCudaComputeCapability.RUnlock() + return calls +} + +// GetDeviceHandleFromMigDeviceHandle calls GetDeviceHandleFromMigDeviceHandleFunc. +func (mock *DeviceMock) GetDeviceHandleFromMigDeviceHandle() (Device, error) { + if mock.GetDeviceHandleFromMigDeviceHandleFunc == nil { + panic("DeviceMock.GetDeviceHandleFromMigDeviceHandleFunc: method is nil but Device.GetDeviceHandleFromMigDeviceHandle was just called") + } + callInfo := struct { + }{} + mock.lockGetDeviceHandleFromMigDeviceHandle.Lock() + mock.calls.GetDeviceHandleFromMigDeviceHandle = append(mock.calls.GetDeviceHandleFromMigDeviceHandle, callInfo) + mock.lockGetDeviceHandleFromMigDeviceHandle.Unlock() + return mock.GetDeviceHandleFromMigDeviceHandleFunc() +} + +// GetDeviceHandleFromMigDeviceHandleCalls gets all the calls that were made to GetDeviceHandleFromMigDeviceHandle. 
+// Check the length with: +// +// len(mockedDevice.GetDeviceHandleFromMigDeviceHandleCalls()) +func (mock *DeviceMock) GetDeviceHandleFromMigDeviceHandleCalls() []struct { +} { + var calls []struct { + } + mock.lockGetDeviceHandleFromMigDeviceHandle.RLock() + calls = mock.calls.GetDeviceHandleFromMigDeviceHandle + mock.lockGetDeviceHandleFromMigDeviceHandle.RUnlock() + return calls +} + +// GetFabricIDs calls GetFabricIDsFunc. +func (mock *DeviceMock) GetFabricIDs() (string, string, error) { + if mock.GetFabricIDsFunc == nil { + panic("DeviceMock.GetFabricIDsFunc: method is nil but Device.GetFabricIDs was just called") + } + callInfo := struct { + }{} + mock.lockGetFabricIDs.Lock() + mock.calls.GetFabricIDs = append(mock.calls.GetFabricIDs, callInfo) + mock.lockGetFabricIDs.Unlock() + return mock.GetFabricIDsFunc() +} + +// GetFabricIDsCalls gets all the calls that were made to GetFabricIDs. +// Check the length with: +// +// len(mockedDevice.GetFabricIDsCalls()) +func (mock *DeviceMock) GetFabricIDsCalls() []struct { +} { + var calls []struct { + } + mock.lockGetFabricIDs.RLock() + calls = mock.calls.GetFabricIDs + mock.lockGetFabricIDs.RUnlock() + return calls +} + +// GetMigDevices calls GetMigDevicesFunc. +func (mock *DeviceMock) GetMigDevices() ([]Device, error) { + if mock.GetMigDevicesFunc == nil { + panic("DeviceMock.GetMigDevicesFunc: method is nil but Device.GetMigDevices was just called") + } + callInfo := struct { + }{} + mock.lockGetMigDevices.Lock() + mock.calls.GetMigDevices = append(mock.calls.GetMigDevices, callInfo) + mock.lockGetMigDevices.Unlock() + return mock.GetMigDevicesFunc() +} + +// GetMigDevicesCalls gets all the calls that were made to GetMigDevices. 
+// Check the length with: +// +// len(mockedDevice.GetMigDevicesCalls()) +func (mock *DeviceMock) GetMigDevicesCalls() []struct { +} { + var calls []struct { + } + mock.lockGetMigDevices.RLock() + calls = mock.calls.GetMigDevices + mock.lockGetMigDevices.RUnlock() + return calls +} + +// GetName calls GetNameFunc. +func (mock *DeviceMock) GetName() (string, error) { + if mock.GetNameFunc == nil { + panic("DeviceMock.GetNameFunc: method is nil but Device.GetName was just called") + } + callInfo := struct { + }{} + mock.lockGetName.Lock() + mock.calls.GetName = append(mock.calls.GetName, callInfo) + mock.lockGetName.Unlock() + return mock.GetNameFunc() +} + +// GetNameCalls gets all the calls that were made to GetName. +// Check the length with: +// +// len(mockedDevice.GetNameCalls()) +func (mock *DeviceMock) GetNameCalls() []struct { +} { + var calls []struct { + } + mock.lockGetName.RLock() + calls = mock.calls.GetName + mock.lockGetName.RUnlock() + return calls +} + +// GetPCIClass calls GetPCIClassFunc. +func (mock *DeviceMock) GetPCIClass() (uint32, error) { + if mock.GetPCIClassFunc == nil { + panic("DeviceMock.GetPCIClassFunc: method is nil but Device.GetPCIClass was just called") + } + callInfo := struct { + }{} + mock.lockGetPCIClass.Lock() + mock.calls.GetPCIClass = append(mock.calls.GetPCIClass, callInfo) + mock.lockGetPCIClass.Unlock() + return mock.GetPCIClassFunc() +} + +// GetPCIClassCalls gets all the calls that were made to GetPCIClass. +// Check the length with: +// +// len(mockedDevice.GetPCIClassCalls()) +func (mock *DeviceMock) GetPCIClassCalls() []struct { +} { + var calls []struct { + } + mock.lockGetPCIClass.RLock() + calls = mock.calls.GetPCIClass + mock.lockGetPCIClass.RUnlock() + return calls +} + +// GetTotalMemoryMB calls GetTotalMemoryMBFunc. 
+func (mock *DeviceMock) GetTotalMemoryMB() (uint64, error) { + if mock.GetTotalMemoryMBFunc == nil { + panic("DeviceMock.GetTotalMemoryMBFunc: method is nil but Device.GetTotalMemoryMB was just called") + } + callInfo := struct { + }{} + mock.lockGetTotalMemoryMB.Lock() + mock.calls.GetTotalMemoryMB = append(mock.calls.GetTotalMemoryMB, callInfo) + mock.lockGetTotalMemoryMB.Unlock() + return mock.GetTotalMemoryMBFunc() +} + +// GetTotalMemoryMBCalls gets all the calls that were made to GetTotalMemoryMB. +// Check the length with: +// +// len(mockedDevice.GetTotalMemoryMBCalls()) +func (mock *DeviceMock) GetTotalMemoryMBCalls() []struct { +} { + var calls []struct { + } + mock.lockGetTotalMemoryMB.RLock() + calls = mock.calls.GetTotalMemoryMB + mock.lockGetTotalMemoryMB.RUnlock() + return calls +} + +// IsFabricAttached calls IsFabricAttachedFunc. +func (mock *DeviceMock) IsFabricAttached() (bool, error) { + if mock.IsFabricAttachedFunc == nil { + panic("DeviceMock.IsFabricAttachedFunc: method is nil but Device.IsFabricAttached was just called") + } + callInfo := struct { + }{} + mock.lockIsFabricAttached.Lock() + mock.calls.IsFabricAttached = append(mock.calls.IsFabricAttached, callInfo) + mock.lockIsFabricAttached.Unlock() + return mock.IsFabricAttachedFunc() +} + +// IsFabricAttachedCalls gets all the calls that were made to IsFabricAttached. +// Check the length with: +// +// len(mockedDevice.IsFabricAttachedCalls()) +func (mock *DeviceMock) IsFabricAttachedCalls() []struct { +} { + var calls []struct { + } + mock.lockIsFabricAttached.RLock() + calls = mock.calls.IsFabricAttached + mock.lockIsFabricAttached.RUnlock() + return calls +} + +// IsMigCapable calls IsMigCapableFunc. 
+func (mock *DeviceMock) IsMigCapable() (bool, error) { + if mock.IsMigCapableFunc == nil { + panic("DeviceMock.IsMigCapableFunc: method is nil but Device.IsMigCapable was just called") + } + callInfo := struct { + }{} + mock.lockIsMigCapable.Lock() + mock.calls.IsMigCapable = append(mock.calls.IsMigCapable, callInfo) + mock.lockIsMigCapable.Unlock() + return mock.IsMigCapableFunc() +} + +// IsMigCapableCalls gets all the calls that were made to IsMigCapable. +// Check the length with: +// +// len(mockedDevice.IsMigCapableCalls()) +func (mock *DeviceMock) IsMigCapableCalls() []struct { +} { + var calls []struct { + } + mock.lockIsMigCapable.RLock() + calls = mock.calls.IsMigCapable + mock.lockIsMigCapable.RUnlock() + return calls +} + +// IsMigEnabled calls IsMigEnabledFunc. +func (mock *DeviceMock) IsMigEnabled() (bool, error) { + if mock.IsMigEnabledFunc == nil { + panic("DeviceMock.IsMigEnabledFunc: method is nil but Device.IsMigEnabled was just called") + } + callInfo := struct { + }{} + mock.lockIsMigEnabled.Lock() + mock.calls.IsMigEnabled = append(mock.calls.IsMigEnabled, callInfo) + mock.lockIsMigEnabled.Unlock() + return mock.IsMigEnabledFunc() +} + +// IsMigEnabledCalls gets all the calls that were made to IsMigEnabled. +// Check the length with: +// +// len(mockedDevice.IsMigEnabledCalls()) +func (mock *DeviceMock) IsMigEnabledCalls() []struct { +} { + var calls []struct { + } + mock.lockIsMigEnabled.RLock() + calls = mock.calls.IsMigEnabled + mock.lockIsMigEnabled.RUnlock() + return calls +} diff --git a/pkg/nvidia-plugin/pkg/resource/factory.go b/pkg/nvidia-plugin/pkg/resource/factory.go new file mode 100644 index 000000000..88a1e1a05 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/resource/factory.go @@ -0,0 +1,84 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package resource
+
+import (
+	"fmt"
+
+	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
+	"github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"k8s.io/klog/v2"
+
+	spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1"
+)
+
+// NewManager is a factory method that creates a resource Manager based on the specified config.
+// If no manager can be selected for the configured device discovery strategy, the error is
+// returned when FailOnInitError is set; otherwise it is logged and a null manager is returned.
+func NewManager(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, config *spec.Config) (Manager, error) {
+	manager, err := getManager(infolib, nvmllib, devicelib, *config.Flags.DeviceDiscoveryStrategy)
+	if err != nil {
+		if *config.Flags.FailOnInitError {
+			return nil, err
+		}
+		// Best-effort mode: report the problem and continue with a manager that exposes no devices.
+		klog.ErrorS(err, "using empty manager")
+		return NewNullManager(), nil
+	}
+	return WithConfig(manager, config), nil
+}
+
+// WithConfig modifies a manager depending on the specified config.
+// If failure on a call to init is allowed, the manager is wrapped to allow fallback to a Null manager.
+func WithConfig(manager Manager, config *spec.Config) Manager {
+	// When init errors must be fatal the manager is used as-is so that its Init error reaches the caller.
+	if *config.Flags.FailOnInitError {
+		return manager
+	}
+
+	return NewFallbackToNullOnInitError(manager)
+}
+
+// getManager returns the resource manager depending on the system configuration.
+func getManager(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, strategy string) (Manager, error) {
+	// Map an empty or "auto" strategy onto a concrete mode before dispatching.
+	resolved := resolveMode(infolib, strategy)
+	switch resolved {
+	case "nvml":
+		klog.Info("Using NVML manager")
+		return NewNVMLManager(nvmllib, devicelib), nil
+	case "tegra":
+		// The "tegra" mode is served by the CUDA-based manager.
+		klog.Info("Using CUDA manager")
+		return NewCudaManager(), nil
+	case "vfio":
+		klog.Info("Using Vfio manager")
+		return NewVfioManager(), nil
+	default:
+		return nil, fmt.Errorf("unsupported strategy %v", resolved)
+	}
+}
+
+// resolveMode returns the discovery mode to use for the given strategy.
+// Any strategy other than "" or "auto" is returned unchanged. Otherwise the
+// platform reported by infolib selects "nvml" (NVML and WSL platforms) or
+// "tegra" (Tegra platform); for any other platform the original strategy
+// value is returned as-is.
+func resolveMode(infolib info.Interface, strategy string) string {
+	if strategy != "" && strategy != "auto" {
+		return strategy
+	}
+
+	platform := infolib.ResolvePlatform()
+	switch platform {
+	case info.PlatformNVML, info.PlatformWSL:
+		return "nvml"
+	case info.PlatformTegra:
+		return "tegra"
+	}
+	return strategy
+}
diff --git a/pkg/nvidia-plugin/pkg/resource/fallback.go b/pkg/nvidia-plugin/pkg/resource/fallback.go
new file mode 100644
index 000000000..03f7db67e
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/resource/fallback.go
@@ -0,0 +1,64 @@
+/**
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package resource
+
+import (
+	"k8s.io/klog/v2"
+)
+
+// withFallBack pairs the manager currently in use (wraps) with the manager
+// that takes its place if needed (fallback).
+type withFallBack struct {
+	wraps    Manager
+	fallback Manager
+}
+
+// NewFallbackToNullOnInitError creates a manager that becomes a Null manager on the first Init error.
+func NewFallbackToNullOnInitError(m Manager) Manager {
+	return &withFallBack{
+		wraps:    m,
+		fallback: NewNullManager(),
+	}
+}
+
+// Init calls the Init function and if this does not succeed falls back to a Null manager.
+// The Init error is only logged; Init itself always returns nil.
+func (m *withFallBack) Init() error {
+	err := m.wraps.Init()
+	if err != nil {
+		klog.Warningf("Failed to initialize resource manager: %v", err)
+		// From here on every delegated call goes to the fallback manager
+		// instead of the manager that failed to initialize.
+		m.wraps = m.fallback
+	}
+	return nil
+}
+
+// Shutdown delegates to the wrapped manager
+func (m *withFallBack) Shutdown() (err error) {
+	return m.wraps.Shutdown()
+}
+
+// GetDevices delegates to the wrapped manager
+func (m *withFallBack) GetDevices() ([]Device, error) {
+	return m.wraps.GetDevices()
+}
+
+// GetCudaDriverVersion delegates to the wrapped manager
+func (m *withFallBack) GetCudaDriverVersion() (int, int, error) {
+	return m.wraps.GetCudaDriverVersion()
+}
+
+// GetDriverVersion delegates to the wrapped manager
+func (m *withFallBack) GetDriverVersion() (string, error) {
+	return m.wraps.GetDriverVersion()
+}
diff --git a/pkg/nvidia-plugin/pkg/resource/fallback_test.go b/pkg/nvidia-plugin/pkg/resource/fallback_test.go
new file mode 100644
index 000000000..9a21fd852
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/resource/fallback_test.go
@@ -0,0 +1,62 @@
+/**
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package resource
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// TestFallback checks that a manager created by NewFallbackToNullOnInitError
+// never reports an Init error, and that Shutdown only surfaces the mock's
+// error when Init succeeded.
+func TestFallback(t *testing.T) {
+	testCases := []struct {
+		initError     error
+		shutdownError error
+	}{
+		{
+			// Init fails; shutdownError is nil, so Shutdown is expected to succeed.
+			initError: fmt.Errorf("init failed"),
+		},
+		{
+			// Init succeeds; the error returned by the mock's Shutdown is expected
+			// to be passed through to the caller.
+			shutdownError: fmt.Errorf("should not be called"),
+		},
+	}
+
+	for _, tc := range testCases {
+
+		m := &ManagerMock{
+			InitFunc: func() error {
+				return tc.initError
+			},
+			ShutdownFunc: func() error {
+				return tc.shutdownError
+			},
+		}
+
+		f := NewFallbackToNullOnInitError(m)
+
+		// Init on the wrapper must succeed regardless of the mock's Init result.
+		require.NoError(t, f.Init())
+
+		err := f.Shutdown()
+		if tc.shutdownError == nil {
+			require.NoError(t, err)
+		} else {
+			require.EqualError(t, err, tc.shutdownError.Error())
+		}
+
+	}
+}
diff --git a/pkg/nvidia-plugin/pkg/resource/manager_mock.go b/pkg/nvidia-plugin/pkg/resource/manager_mock.go
new file mode 100644
index 000000000..3543e0a7b
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/resource/manager_mock.go
@@ -0,0 +1,215 @@
+// Code generated by moq; DO NOT EDIT.
+// github.com/matryer/moq
+
+package resource
+
+import (
+	"sync"
+)
+
+// Ensure, that ManagerMock does implement Manager.
+// If this is not the case, regenerate this file with moq.
+var _ Manager = &ManagerMock{}
+
+// ManagerMock is a mock implementation of Manager.
+// +// func TestSomethingThatUsesManager(t *testing.T) { +// +// // make and configure a mocked Manager +// mockedManager := &ManagerMock{ +// GetCudaDriverVersionFunc: func() (int, int, error) { +// panic("mock out the GetCudaDriverVersion method") +// }, +// GetDevicesFunc: func() ([]Device, error) { +// panic("mock out the GetDevices method") +// }, +// GetDriverVersionFunc: func() (string, error) { +// panic("mock out the GetDriverVersion method") +// }, +// InitFunc: func() error { +// panic("mock out the Init method") +// }, +// ShutdownFunc: func() error { +// panic("mock out the Shutdown method") +// }, +// } +// +// // use mockedManager in code that requires Manager +// // and then make assertions. +// +// } +type ManagerMock struct { + // GetCudaDriverVersionFunc mocks the GetCudaDriverVersion method. + GetCudaDriverVersionFunc func() (int, int, error) + + // GetDevicesFunc mocks the GetDevices method. + GetDevicesFunc func() ([]Device, error) + + // GetDriverVersionFunc mocks the GetDriverVersion method. + GetDriverVersionFunc func() (string, error) + + // InitFunc mocks the Init method. + InitFunc func() error + + // ShutdownFunc mocks the Shutdown method. + ShutdownFunc func() error + + // calls tracks calls to the methods. + calls struct { + // GetCudaDriverVersion holds details about calls to the GetCudaDriverVersion method. + GetCudaDriverVersion []struct { + } + // GetDevices holds details about calls to the GetDevices method. + GetDevices []struct { + } + // GetDriverVersion holds details about calls to the GetDriverVersion method. + GetDriverVersion []struct { + } + // Init holds details about calls to the Init method. + Init []struct { + } + // Shutdown holds details about calls to the Shutdown method. 
+ Shutdown []struct { + } + } + lockGetCudaDriverVersion sync.RWMutex + lockGetDevices sync.RWMutex + lockGetDriverVersion sync.RWMutex + lockInit sync.RWMutex + lockShutdown sync.RWMutex +} + +// GetCudaDriverVersion calls GetCudaDriverVersionFunc. +func (mock *ManagerMock) GetCudaDriverVersion() (int, int, error) { + if mock.GetCudaDriverVersionFunc == nil { + panic("ManagerMock.GetCudaDriverVersionFunc: method is nil but Manager.GetCudaDriverVersion was just called") + } + callInfo := struct { + }{} + mock.lockGetCudaDriverVersion.Lock() + mock.calls.GetCudaDriverVersion = append(mock.calls.GetCudaDriverVersion, callInfo) + mock.lockGetCudaDriverVersion.Unlock() + return mock.GetCudaDriverVersionFunc() +} + +// GetCudaDriverVersionCalls gets all the calls that were made to GetCudaDriverVersion. +// Check the length with: +// +// len(mockedManager.GetCudaDriverVersionCalls()) +func (mock *ManagerMock) GetCudaDriverVersionCalls() []struct { +} { + var calls []struct { + } + mock.lockGetCudaDriverVersion.RLock() + calls = mock.calls.GetCudaDriverVersion + mock.lockGetCudaDriverVersion.RUnlock() + return calls +} + +// GetDevices calls GetDevicesFunc. +func (mock *ManagerMock) GetDevices() ([]Device, error) { + if mock.GetDevicesFunc == nil { + panic("ManagerMock.GetDevicesFunc: method is nil but Manager.GetDevices was just called") + } + callInfo := struct { + }{} + mock.lockGetDevices.Lock() + mock.calls.GetDevices = append(mock.calls.GetDevices, callInfo) + mock.lockGetDevices.Unlock() + return mock.GetDevicesFunc() +} + +// GetDevicesCalls gets all the calls that were made to GetDevices. +// Check the length with: +// +// len(mockedManager.GetDevicesCalls()) +func (mock *ManagerMock) GetDevicesCalls() []struct { +} { + var calls []struct { + } + mock.lockGetDevices.RLock() + calls = mock.calls.GetDevices + mock.lockGetDevices.RUnlock() + return calls +} + +// GetDriverVersion calls GetDriverVersionFunc. 
+func (mock *ManagerMock) GetDriverVersion() (string, error) { + if mock.GetDriverVersionFunc == nil { + panic("ManagerMock.GetDriverVersionFunc: method is nil but Manager.GetDriverVersion was just called") + } + callInfo := struct { + }{} + mock.lockGetDriverVersion.Lock() + mock.calls.GetDriverVersion = append(mock.calls.GetDriverVersion, callInfo) + mock.lockGetDriverVersion.Unlock() + return mock.GetDriverVersionFunc() +} + +// GetDriverVersionCalls gets all the calls that were made to GetDriverVersion. +// Check the length with: +// +// len(mockedManager.GetDriverVersionCalls()) +func (mock *ManagerMock) GetDriverVersionCalls() []struct { +} { + var calls []struct { + } + mock.lockGetDriverVersion.RLock() + calls = mock.calls.GetDriverVersion + mock.lockGetDriverVersion.RUnlock() + return calls +} + +// Init calls InitFunc. +func (mock *ManagerMock) Init() error { + if mock.InitFunc == nil { + panic("ManagerMock.InitFunc: method is nil but Manager.Init was just called") + } + callInfo := struct { + }{} + mock.lockInit.Lock() + mock.calls.Init = append(mock.calls.Init, callInfo) + mock.lockInit.Unlock() + return mock.InitFunc() +} + +// InitCalls gets all the calls that were made to Init. +// Check the length with: +// +// len(mockedManager.InitCalls()) +func (mock *ManagerMock) InitCalls() []struct { +} { + var calls []struct { + } + mock.lockInit.RLock() + calls = mock.calls.Init + mock.lockInit.RUnlock() + return calls +} + +// Shutdown calls ShutdownFunc. +func (mock *ManagerMock) Shutdown() error { + if mock.ShutdownFunc == nil { + panic("ManagerMock.ShutdownFunc: method is nil but Manager.Shutdown was just called") + } + callInfo := struct { + }{} + mock.lockShutdown.Lock() + mock.calls.Shutdown = append(mock.calls.Shutdown, callInfo) + mock.lockShutdown.Unlock() + return mock.ShutdownFunc() +} + +// ShutdownCalls gets all the calls that were made to Shutdown. 
+// Check the length with:
+//
+//	len(mockedManager.ShutdownCalls())
+func (mock *ManagerMock) ShutdownCalls() []struct {
+} {
+	var calls []struct {
+	}
+	mock.lockShutdown.RLock()
+	calls = mock.calls.Shutdown
+	mock.lockShutdown.RUnlock()
+	return calls
+}
diff --git a/pkg/nvidia-plugin/pkg/resource/null.go b/pkg/nvidia-plugin/pkg/resource/null.go
new file mode 100644
index 000000000..0955fc43b
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/resource/null.go
@@ -0,0 +1,57 @@
+/**
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package resource
+
+import (
+	"fmt"
+)
+
+// null is a Manager implementation that performs no work.
+type null struct{}
+
+var _ Manager = (*null)(nil)
+
+// NewNullManager returns a Manager that can be used when no operations are
+// required.
+// This returns no devices and the Init and Shutdown methods are no-ops.
+func NewNullManager() Manager {
+	return &null{}
+}
+
+// Init is a no-op for the null manager
+func (l *null) Init() error {
+	return nil
+}
+
+// Shutdown is a no-op for the null manager
+func (l *null) Shutdown() (err error) {
+	return nil
+}
+
+// GetDevices returns a nil slice for the null manager
+func (l *null) GetDevices() ([]Device, error) {
+	return nil, nil
+}
+
+// GetCudaDriverVersion is not supported by the null manager and always returns an error.
+func (l *null) GetCudaDriverVersion() (int, int, error) {
+	return 0, 0, fmt.Errorf("GetCudaDriverVersion is unsupported")
+}
+
+// GetDriverVersion is not supported by the null manager and always returns an error.
+func (l *null) GetDriverVersion() (string, error) {
+	return "", fmt.Errorf("GetDriverVersion is unsupported")
+}
diff --git a/pkg/nvidia-plugin/pkg/resource/nvml-device.go b/pkg/nvidia-plugin/pkg/resource/nvml-device.go
new file mode 100644
index 000000000..9b29dc7bd
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/resource/nvml-device.go
@@ -0,0 +1,119 @@
+/**
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package resource
+
+import (
+	"fmt"
+
+	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
+	"github.com/NVIDIA/go-nvlib/pkg/nvpci"
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+
+	"github.com/google/uuid"
+)
+
+// nvmlDevice adapts a go-nvlib device.Device to the Device interface of this
+// package. The devicelib handle is passed on to the MIG devices it creates.
+type nvmlDevice struct {
+	device.Device
+	devicelib device.Interface
+}
+
+var _ Device = (*nvmlDevice)(nil)
+
+// GetMigDevices returns the list of MIG devices configured on this device
+func (d nvmlDevice) GetMigDevices() ([]Device, error) {
+	migs, err := d.Device.GetMigDevices()
+	if err != nil {
+		return nil, err
+	}
+
+	var devices []Device
+	for _, m := range migs {
+		// NOTE: the local name "device" shadows the imported device package
+		// for the remainder of this loop body.
+		device := nvmlMigDevice{
+			MigDevice: m,
+			devicelib: d.devicelib,
+		}
+		devices = append(devices, device)
+	}
+
+	return devices, nil
+}
+
+// GetCudaComputeCapability returns the CUDA major and minor versions.
+// On failure the NVML return code is returned as the error.
+func (d nvmlDevice) GetCudaComputeCapability() (int, int, error) {
+	major, minor, ret := d.Device.GetCudaComputeCapability()
+	if ret != nvml.SUCCESS {
+		return 0, 0, ret
+	}
+
+	return major, minor, nil
+}
+
+// GetAttributes is only supported for MIG devices.
+func (d nvmlDevice) GetAttributes() (map[string]interface{}, error) {
+	return nil, fmt.Errorf("GetAttributes is not supported for non-MIG devices")
+}
+
+// GetDeviceHandleFromMigDeviceHandle is only supported for MIG devices
+func (d nvmlDevice) GetDeviceHandleFromMigDeviceHandle() (Device, error) {
+	return nil, fmt.Errorf("GetDeviceHandleFromMigDeviceHandle is not supported for non-MIG devices")
+}
+
+// GetName returns the device name / model.
+func (d nvmlDevice) GetName() (string, error) {
+	name, ret := d.Device.GetName()
+	if ret != nvml.SUCCESS {
+		return "", ret
+	}
+	return name, nil
+}
+
+// GetTotalMemoryMB returns the total memory on a device in MB
+func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) {
+	info, ret := d.Device.GetMemoryInfo()
+	if ret != nvml.SUCCESS {
+		return 0, ret
+	}
+	// Integer division by 1024*1024; any remainder is discarded.
+	return info.Total / (1024 * 1024), nil
+}
+
+// GetPCIClass returns the Class field of the nvpci GPU entry found for this
+// device's PCI bus ID.
+func (d nvmlDevice) GetPCIClass() (uint32, error) {
+	pciBusID, err := d.GetPCIBusID()
+	if err != nil {
+		return 0, err
+	}
+	nvDevice, err := nvpci.New().GetGPUByPciBusID(pciBusID)
+	if err != nil {
+		return 0, err
+	}
+	return nvDevice.Class, nil
+}
+
+// GetFabricIDs returns the cluster UUID (in string form) and the clique ID
+// (formatted as a decimal string) taken from the device's GPU fabric info.
+func (d nvmlDevice) GetFabricIDs() (string, string, error) {
+	info, ret := d.GetGpuFabricInfo()
+	if ret != nvml.SUCCESS {
+		return "", "", fmt.Errorf("failed to get GPU fabric info: %w", ret)
+	}
+
+	clusterUUID, err := uuid.FromBytes(info.ClusterUuid[:])
+	if err != nil {
+		return "", "", fmt.Errorf("invalid cluster UUID: %w", err)
+	}
+
+	cliqueId := fmt.Sprintf("%d", info.CliqueId)
+
+	return clusterUUID.String(), cliqueId, nil
+}
diff --git a/pkg/nvidia-plugin/pkg/resource/nvml-lib.go b/pkg/nvidia-plugin/pkg/resource/nvml-lib.go
new file mode 100644
index 000000000..ad1d97216
--- /dev/null
+++ b/pkg/nvidia-plugin/pkg/resource/nvml-lib.go
@@ -0,0 +1,94 @@
+/**
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/ + +package resource + +import ( + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +type nvmlLib struct { + nvml.Interface + devicelib device.Interface +} + +// NewNVMLManager creates a new manager that uses NVML to query and manage devices +func NewNVMLManager(nvmllib nvml.Interface, devicelib device.Interface) Manager { + m := nvmlLib{ + Interface: nvmllib, + devicelib: devicelib, + } + return m +} + +// GetCudaDriverVersion : Return the cuda v using NVML +func (l nvmlLib) GetCudaDriverVersion() (int, int, error) { + v, ret := l.Interface.SystemGetCudaDriverVersion() + if ret != nvml.SUCCESS { + return 0, 0, ret + } + major := v / 1000 + minor := v % 1000 / 10 + + return major, minor, nil +} + +// GetDevices returns the NVML devices for the manager +func (l nvmlLib) GetDevices() ([]Device, error) { + libdevices, err := l.devicelib.GetDevices() + if err != nil { + return nil, err + } + + var devices []Device + for _, d := range libdevices { + device := nvmlDevice{ + Device: d, + devicelib: l.devicelib, + } + devices = append(devices, device) + } + + return devices, nil +} + +// GetDriverVersion returns the driver version +func (l nvmlLib) GetDriverVersion() (string, error) { + v, ret := l.Interface.SystemGetDriverVersion() + if ret != nvml.SUCCESS { + return "", ret + } + return v, nil +} + +// Init initialises the library +func (l nvmlLib) Init() error { + ret := l.Interface.Init() + if ret != nvml.SUCCESS { + return ret + } + return nil +} + +// Shutdown shuts down the library +func (l nvmlLib) Shutdown() error { + ret := l.Interface.Shutdown() + if ret != nvml.SUCCESS { + return ret + } + return nil +} diff --git a/pkg/nvidia-plugin/pkg/resource/nvml-mig-device.go b/pkg/nvidia-plugin/pkg/resource/nvml-mig-device.go new file mode 100644 index 000000000..cf3d05300 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/resource/nvml-mig-device.go @@ -0,0 +1,152 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package resource + +import ( + "fmt" + "strings" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvpci" + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +type nvmlMigDevice struct { + device.MigDevice + devicelib device.Interface +} + +var _ Device = (*nvmlMigDevice)(nil) + +// GetAttributes is only supported for MIG devices. +func (d nvmlMigDevice) GetAttributes() (map[string]interface{}, error) { + attributes, ret := d.MigDevice.GetAttributes() + if ret != nvml.SUCCESS { + return nil, ret + } + a := map[string]interface{}{ + "memory": attributes.MemorySizeMB, + "multiprocessors": attributes.MultiprocessorCount, + "slices.gi": attributes.GpuInstanceSliceCount, + "slices.ci": attributes.ComputeInstanceSliceCount, + "engines.copy": attributes.SharedCopyEngineCount, + "engines.decoder": attributes.SharedDecoderCount, + "engines.encoder": attributes.SharedEncoderCount, + "engines.jpeg": attributes.SharedJpegCount, + "engines.ofa": attributes.SharedOfaCount, + } + + return a, nil +} + +// GetDeviceHandleFromMigDeviceHandle is only supported for MIG devices +func (d nvmlMigDevice) GetDeviceHandleFromMigDeviceHandle() (Device, error) { + p, ret := d.MigDevice.GetDeviceHandleFromMigDeviceHandle() + if ret != nvml.SUCCESS { + return nil, ret + } + + device, err := d.devicelib.NewDevice(p) + if err != nil { + return nil, fmt.Errorf("failed to construct device: %v", err) + } + + parent := 
nvmlDevice{ + Device: device, + devicelib: d.devicelib, + } + return parent, nil +} + +// IsMigCapable is not supported for MIG devices +func (d nvmlMigDevice) IsMigCapable() (bool, error) { + return false, fmt.Errorf("IsMigCapable is not supported for MIG devices") +} + +// IsMigEnabled is not supported for MIG devices +func (d nvmlMigDevice) IsMigEnabled() (bool, error) { + return false, fmt.Errorf("IsMigEnabled is not supported for MIG devices") +} + +// GetMigDevices is not supported for MIG devices +func (d nvmlMigDevice) GetMigDevices() ([]Device, error) { + return nil, fmt.Errorf("GetMigDevices is not implemented for MIG devices") +} + +// GetCudaComputeCapability is not supported for MIG devices +func (d nvmlMigDevice) GetCudaComputeCapability() (int, int, error) { + return 0, 0, fmt.Errorf("GetCudaComputeCapability is not supported for MIG devices") +} + +// GetName returns the name of the nvmlMigDevice. +// This is equal to the mig profile. +func (d nvmlMigDevice) GetName() (string, error) { + p, err := d.MigDevice.GetProfile() + if err != nil { + return "", fmt.Errorf("failed to get MIG profile: %v", err) + } + + resourceName := strings.ReplaceAll(p.String(), "+", ".") + return resourceName, nil +} + +// GetTotalMemoryMB returns the total memory on a device in MB +func (d nvmlMigDevice) GetTotalMemoryMB() (uint64, error) { + attr, err := d.GetAttributes() + if err != nil { + return 0, err + } + + total, err := totalMemory(attr) + if err != nil { + return 0, err + } + return total, nil +} + +func totalMemory(attr map[string]interface{}) (uint64, error) { + totalMemory, ok := attr["memory"] + if !ok { + return 0, fmt.Errorf("no 'memory' attribute available") + } + + switch totalMemory := totalMemory.(type) { + case uint64: + return totalMemory, nil + case int: + if totalMemory < 0 { + return 0, fmt.Errorf("unexpected memory value %v", totalMemory) + } + //nolint:gosec // Here we are sure that the value will fit in memory and be positive. 
+ return uint64(totalMemory), nil + default: // report the dynamic type, since that is what failed to match a supported case + return 0, fmt.Errorf("unsupported attribute type %T", totalMemory) + } +} + +func (d nvmlMigDevice) GetPCIClass() (uint32, error) { + // GPU devices that support MIG do not support switching mode between graphics and compute, so they are always in compute mode. + return nvpci.PCI3dControllerClass, nil +} + +func (d nvmlMigDevice) IsFabricAttached() (bool, error) { + return false, fmt.Errorf("IsFabricAttached is not supported for MIG devices") +} + +func (d nvmlMigDevice) GetFabricIDs() (string, string, error) { + return "", "", fmt.Errorf("GetFabricIDs is not supported for MIG devices") +} diff --git a/pkg/nvidia-plugin/pkg/resource/sysfs-device.go b/pkg/nvidia-plugin/pkg/resource/sysfs-device.go new file mode 100644 index 000000000..a3097a108 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/resource/sysfs-device.go @@ -0,0 +1,77 @@ +/** +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package resource + +import ( + "fmt" + + "github.com/NVIDIA/go-nvlib/pkg/nvpci" +) + +type vfioDevice struct { + nvidiaPCIDevice *nvpci.NvidiaPCIDevice +} + +// GetMigDevices returns the list of MIG devices configured on this device +func (d vfioDevice) GetMigDevices() ([]Device, error) { + return nil, nil +} + +// GetCudaComputeCapability is not supported for GPU devices with vfio pci driver. 
+func (d vfioDevice) GetCudaComputeCapability() (int, int, error) { + return -1, -1, nil +} + +// GetAttributes is only supported for MIG devices. +func (d vfioDevice) GetAttributes() (map[string]interface{}, error) { + return nil, fmt.Errorf("GetAttributes is not supported for non-MIG devices") +} + +// GetDeviceHandleFromMigDeviceHandle is only supported for MIG devices +func (d vfioDevice) GetDeviceHandleFromMigDeviceHandle() (Device, error) { + return nil, fmt.Errorf("GetDeviceHandleFromMigDeviceHandle is not supported for non-MIG devices") +} + +// GetName returns the device name / model. +func (d vfioDevice) GetName() (string, error) { + return d.nvidiaPCIDevice.DeviceName, nil +} + +// GetTotalMemoryMB returns the total memory on a device in MB +func (d vfioDevice) GetTotalMemoryMB() (uint64, error) { + _, val := d.nvidiaPCIDevice.Resources.GetTotalAddressableMemory(true) + return val, nil +} + +func (d vfioDevice) IsMigEnabled() (bool, error) { + return false, nil +} + +func (d vfioDevice) IsMigCapable() (bool, error) { + return false, nil +} + +func (d vfioDevice) GetPCIClass() (uint32, error) { + return d.nvidiaPCIDevice.Class, nil +} + +func (d vfioDevice) IsFabricAttached() (bool, error) { + return false, nil +} +func (d vfioDevice) GetFabricIDs() (string, string, error) { + return "", "", fmt.Errorf("GetFabricIDs is not supported for vfio devices") +} diff --git a/pkg/nvidia-plugin/pkg/resource/sysfs-lib.go b/pkg/nvidia-plugin/pkg/resource/sysfs-lib.go new file mode 100644 index 000000000..739d98351 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/resource/sysfs-lib.go @@ -0,0 +1,74 @@ +/** +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package resource + +import ( + "github.com/NVIDIA/go-nvlib/pkg/nvpci" + "k8s.io/klog/v2" +) + +type vfioLib struct { + nvpcilib nvpci.Interface +} + +// NewVfioManager returns a resource manager for devices bound to the VFIO PCI driver. +func NewVfioManager() Manager { + nvpcilib := nvpci.New() + manager := vfioLib{ + nvpcilib: nvpcilib, + } + return &manager +} + +// Init is a no-op for the vfio manager +func (l *vfioLib) Init() error { + return nil +} + +// Shutdown is a no-op for the vfio manager +func (l *vfioLib) Shutdown() (err error) { + return nil +} + +// GetDevices returns the GPUs bound to the 'vfio-pci' driver; GPUs bound to any other driver are logged and skipped. +func (l *vfioLib) GetDevices() ([]Device, error) { + var devices []Device + nvdevices, err := l.nvpcilib.GetGPUs() + if err != nil { + return nil, err + } + + for _, dev := range nvdevices { + if dev.Driver == "vfio-pci" { + vfioDev := vfioDevice{dev} + devices = append(devices, vfioDev) + } else { + klog.Infof("Device not bound to 'vfio-pci'; device: %s driver: '%s'", dev.Address, dev.Driver) + } + } + return devices, nil +} + +// GetCudaDriverVersion is not supported for the vfio manager; it always returns 0, 0 and a nil error. +func (l *vfioLib) GetCudaDriverVersion() (int, int, error) { + return 0, 0, nil +} + +// GetDriverVersion is not supported for the vfio manager; it always returns the placeholder "unknown.unknown.unknown". +func (l *vfioLib) GetDriverVersion() (string, error) { + return "unknown.unknown.unknown", nil +} diff --git a/pkg/nvidia-plugin/pkg/resource/testing/resource-testing.go b/pkg/nvidia-plugin/pkg/resource/testing/resource-testing.go new file mode 100644 index 000000000..968183816 --- /dev/null +++ 
b/pkg/nvidia-plugin/pkg/resource/testing/resource-testing.go @@ -0,0 +1,141 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package testing + +import ( + "fmt" + + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/resource" +) + +// DeviceMock provides an alias that allows for additional functions to be defined. +type DeviceMock struct { + resource.DeviceMock +} + +// NewFullGPU creates a device that can be treated as a full GPU for testing +func NewFullGPU() resource.Device { + return NewDeviceMock(false) +} + +// NewMigEnabledDevice creates a GPU with MIG enabled and the specified MIG devices +func NewMigEnabledDevice(migs ...*resource.DeviceMock) resource.Device { + return NewDeviceMock(true).WithMigDevices(migs...) +} + +// NewDeviceMock creates a devices for testing which can have MIG enabled or disabled. 
+func NewDeviceMock(migEnabled bool) *DeviceMock { + d := DeviceMock{resource.DeviceMock{ + GetNameFunc: func() (string, error) { return "MOCKMODEL", nil }, + GetCudaComputeCapabilityFunc: func() (int, int, error) { + if migEnabled { + return 0, 0, nil + } + return 8, 0, nil + }, + GetTotalMemoryMBFunc: func() (uint64, error) { return uint64(300), nil }, + IsFabricAttachedFunc: func() (bool, error) { return false, nil }, + IsMigEnabledFunc: func() (bool, error) { return migEnabled, nil }, + IsMigCapableFunc: func() (bool, error) { return migEnabled, nil }, + GetMigDevicesFunc: func() ([]resource.Device, error) { return nil, nil }, + GetPCIClassFunc: func() (uint32, error) { return 0, nil }, + }} + return &d +} + +func NewDeviceWithPCIClassMock(pciClass uint32) *DeviceMock { + d := DeviceMock{resource.DeviceMock{ + GetPCIClassFunc: func() (uint32, error) { return pciClass, nil }, + }} + return &d +} + +// NewMigDevice creates a MIG devices with the specified attributes for testing +func NewMigDevice(gi int, ci int, gb uint64, attributes ...map[string]interface{}) *resource.DeviceMock { + + defaultAttributes := map[string]interface{}{ + "memory": gb, + "multiprocessors": 0, + "slices.gi": gi, + "slices.ci": ci, + "engines.copy": 0, + "engines.decoder": 0, + "engines.encoder": 0, + "engines.jpeg": 0, + "engines.ofa": 0, + } + for _, attr := range attributes { + for a, v := range attr { + defaultAttributes[a] = v + } + } + + return &resource.DeviceMock{ + GetNameFunc: func() (string, error) { return fmt.Sprintf("%dg.%dgb", gi, gb), nil }, + GetAttributesFunc: func() (map[string]interface{}, error) { return defaultAttributes, nil }, + } +} + +// WithMigDevices adds the specified MIG devices to the mocked device +func (d *DeviceMock) WithMigDevices(migs ...*resource.DeviceMock) *DeviceMock { + for _, m := range migs { + m.GetDeviceHandleFromMigDeviceHandleFunc = func() (resource.Device, error) { + return d, nil + } + } + d.GetMigDevicesFunc = func() ([]resource.Device, 
error) { + var devices []resource.Device + for _, m := range migs { + devices = append(devices, m) + } + return devices, nil + } + + return d +} + +// ManagerMock provides an alias that allows for additional functions to be defined. +type ManagerMock struct { + resource.ManagerMock +} + +// NewManagerMockWithDevices creates a mocked manager with the specified devices +func NewManagerMockWithDevices(devices ...resource.Device) *ManagerMock { + manager := ManagerMock{resource.ManagerMock{ + InitFunc: func() error { return nil }, + ShutdownFunc: func() error { return nil }, + GetDriverVersionFunc: func() (string, error) { + return "400.300", nil + }, + GetDevicesFunc: func() ([]resource.Device, error) { + return devices, nil + }, + GetCudaDriverVersionFunc: func() (int, int, error) { + return 8, 0, nil + }, + }} + return &manager +} + +// WithErrorOnInit sets the Init function for the ManagerMock to error if called. +func (m *ManagerMock) WithErrorOnInit(err error) *ManagerMock { + m.InitFunc = func() error { + fmt.Printf("returning error = %v", err) + return err + } + return m +} diff --git a/pkg/nvidia-plugin/pkg/resource/types.go b/pkg/nvidia-plugin/pkg/resource/types.go new file mode 100644 index 000000000..dc6fa6a77 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/resource/types.go @@ -0,0 +1,45 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package resource + +// Manager defines an interface for managing devices +// +//go:generate moq -rm -out manager_mock.go . Manager +type Manager interface { + Init() error + Shutdown() error + GetDevices() ([]Device, error) + GetDriverVersion() (string, error) + GetCudaDriverVersion() (int, int, error) +} + +// Device defines an interface for a device with which labels are associated +// +//go:generate moq -out device_mock.go . Device +type Device interface { + IsFabricAttached() (bool, error) + IsMigEnabled() (bool, error) + IsMigCapable() (bool, error) + GetMigDevices() ([]Device, error) + GetAttributes() (map[string]interface{}, error) + GetName() (string, error) + GetTotalMemoryMB() (uint64, error) + GetDeviceHandleFromMigDeviceHandle() (Device, error) + GetCudaComputeCapability() (int, int, error) + GetPCIClass() (uint32, error) + GetFabricIDs() (string, string, error) +} diff --git a/pkg/nvidia-plugin/pkg/rm/allocate.go b/pkg/nvidia-plugin/pkg/rm/allocate.go new file mode 100644 index 000000000..166b68e84 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/rm/allocate.go @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package rm + +import ( + "fmt" + "sort" +) + +// distributedAlloc returns a list of devices such that any replicated +// devices are distributed across all replicated GPUs equally. 
It takes into +// account already allocated replicas to ensure a proper balance across them. +func (r *resourceManager) distributedAlloc(available, required []string, size int) ([]string, error) { + // Get the set of candidate devices as the difference between available and required. + candidates := r.devices.Subset(available).Difference(r.devices.Subset(required)).GetIDs() + needed := size - len(required) + + if len(candidates) < needed { + return nil, fmt.Errorf("not enough available devices to satisfy allocation") + } + + // For each candidate device, build a mapping of (stripped) device ID to + // total / available replicas for that device. + replicas := make(map[string]*struct{ total, available int }) + for _, c := range candidates { + id := AnnotatedID(c).GetID() + if _, exists := replicas[id]; !exists { + replicas[id] = &struct{ total, available int }{} + } + replicas[id].available++ + } + for d := range r.devices { + id := AnnotatedID(d).GetID() + if _, exists := replicas[id]; !exists { + continue + } + replicas[id].total++ + } + + // Grab the set of 'needed' devices one-by-one from the candidates list. + // Before selecting each candidate, first sort the candidate list using the + // replicas map above. After sorting, the first element in the list will + // contain the device with the least difference between total and available + // replications (based on what's already been allocated). Add this device + // to the list of devices to allocate, remove it from the candidate list, + // down its available count in the replicas map, and repeat. 
+ var devices []string + for i := 0; i < needed; i++ { + sort.Slice(candidates, func(i, j int) bool { + iid := AnnotatedID(candidates[i]).GetID() + jid := AnnotatedID(candidates[j]).GetID() + idiff := replicas[iid].total - replicas[iid].available + jdiff := replicas[jid].total - replicas[jid].available + return idiff < jdiff + }) + id := AnnotatedID(candidates[0]).GetID() + replicas[id].available-- + devices = append(devices, candidates[0]) + candidates = candidates[1:] + } + + // Prepend the required devices using a fresh backing array so the caller's 'required' slice is never written to. + devices = append(append([]string(nil), required...), devices...) + + return devices, nil +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map.go b/pkg/nvidia-plugin/pkg/rm/device_map.go similarity index 73% rename from pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map.go rename to pkg/nvidia-plugin/pkg/rm/device_map.go index 4b6a43c6e..1dbc439bd 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map.go +++ b/pkg/nvidia-plugin/pkg/rm/device_map.go @@ -1,61 +1,59 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ package rm import ( "fmt" - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" - "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - "github.com/NVIDIA/go-nvlib/pkg/nvml" - spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "k8s.io/klog/v2" + + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" ) type deviceMapBuilder struct { device.Interface - config *nvidia.DeviceConfig + migStrategy *string + resources *spec.Resources + replicatedResources *spec.ReplicatedResources + + newGPUDevice func(i int, gpu nvml.Device) (string, deviceInfo) } // DeviceMap stores a set of devices per resource name. type DeviceMap map[spec.ResourceName]Devices // NewDeviceMap creates a device map for the specified NVML library and config. 
-func NewDeviceMap(nvmllib nvml.Interface, config *nvidia.DeviceConfig) (DeviceMap, error) { +func NewDeviceMap(infolib info.Interface, devicelib device.Interface, config *nvidia.DeviceConfig) (DeviceMap, error) { b := deviceMapBuilder{ - Interface: device.New(device.WithNvml(nvmllib)), - config: config, + Interface: devicelib, + migStrategy: config.Flags.MigStrategy, + resources: &config.Resources, + replicatedResources: config.Sharing.ReplicatedResources(), + newGPUDevice: newNvmlGPUDevice, } + + if infolib.ResolvePlatform() == info.PlatformWSL { + b.newGPUDevice = newWslGPUDevice + } + return b.build() } @@ -65,9 +63,9 @@ func (b *deviceMapBuilder) build() (DeviceMap, error) { if err != nil { return nil, fmt.Errorf("error building device map from config.resources: %v", err) } - devices, err = updateDeviceMapWithReplicas(b.config, devices) + devices, err = updateDeviceMapWithReplicas(b.replicatedResources, devices) if err != nil { - return nil, fmt.Errorf("error updating device map with replicas from config.sharing.timeSlicing.resources: %v", err) + return nil, fmt.Errorf("error updating device map with replicas from replicatedResources config: %v", err) } return devices, nil } @@ -79,7 +77,7 @@ func (b *deviceMapBuilder) buildDeviceMapFromConfigResources() (DeviceMap, error return nil, fmt.Errorf("error building GPU device map: %v", err) } - if *b.config.Flags.MigStrategy == spec.MigStrategyNone { + if *b.migStrategy == spec.MigStrategyNone { return deviceMap, nil } @@ -89,7 +87,7 @@ func (b *deviceMapBuilder) buildDeviceMapFromConfigResources() (DeviceMap, error } var requireUniformMIGDevices bool - if *b.config.Flags.MigStrategy == spec.MigStrategySingle { + if *b.migStrategy == spec.MigStrategySingle { requireUniformMIGDevices = true } @@ -111,7 +109,7 @@ func (b *deviceMapBuilder) buildDeviceMapFromConfigResources() (DeviceMap, error func (b *deviceMapBuilder) buildGPUDeviceMap() (DeviceMap, error) { devices := make(DeviceMap) - b.VisitDevices(func(i int, 
gpu device.Device) error { + err := b.VisitDevices(func(i int, gpu device.Device) error { name, ret := gpu.GetName() if ret != nvml.SUCCESS { return fmt.Errorf("error getting product name for GPU: %v", ret) @@ -120,18 +118,18 @@ func (b *deviceMapBuilder) buildGPUDeviceMap() (DeviceMap, error) { if err != nil { return fmt.Errorf("error checking if MIG is enabled on GPU: %v", err) } - if migEnabled && *b.config.Flags.MigStrategy != spec.MigStrategyNone { + if migEnabled && *b.migStrategy != spec.MigStrategyNone { return nil } - for _, resource := range b.config.Resources.GPUs { + for _, resource := range b.resources.GPUs { if resource.Pattern.Matches(name) { - index, info := newGPUDevice(i, gpu) + index, info := b.newGPUDevice(i, gpu) return devices.setEntry(resource.Name, index, info) } } return fmt.Errorf("GPU name '%v' does not match any resource patterns", name) }) - return devices, nil + return devices, err } // buildMigDeviceMap builds a map of resource names to MIG devices @@ -142,7 +140,7 @@ func (b *deviceMapBuilder) buildMigDeviceMap() (DeviceMap, error) { if err != nil { return fmt.Errorf("error getting MIG profile for MIG device at index '(%v, %v)': %v", i, j, err) } - for _, resource := range b.config.Resources.MIGs { + for _, resource := range b.resources.MIGs { if resource.Pattern.Matches(migProfile.String()) { index, info := newMigDevice(i, j, mig) return devices.setEntry(resource.Name, index, info) @@ -168,9 +166,11 @@ func (b *deviceMapBuilder) assertAllMigDevicesAreValid(uniform bool) error { if err != nil { return err } - if len(migDevices) == 0 { - i := 0 - return fmt.Errorf("device %v has an invalid MIG configuration", i) + if uniform && len(migDevices) == 0 { + return fmt.Errorf("device %v has no MIG devices configured", i) + } + if !uniform && len(migDevices) == 0 { + klog.Warningf("device %v has no MIG devices configured", i) } return nil }) @@ -198,9 +198,9 @@ func (b *deviceMapBuilder) assertAllMigDevicesAreValid(uniform bool) error { }) } 
-// setEntry sets the DeviceMap entry for the specified resource. -func (d DeviceMap) setEntry(name spec.ResourceName, index string, info deviceInfo) error { - dev, err := BuildDevice(index, info) +// setEntry sets the DeviceMap entry for the specified resource +func (d DeviceMap) setEntry(name spec.ResourceName, index string, device deviceInfo) error { + dev, err := BuildDevice(index, device) if err != nil { return fmt.Errorf("error building Device: %v", err) } @@ -280,13 +280,14 @@ func (d DeviceMap) getIDsOfDevicesToReplicate(r *spec.ReplicatedResource) ([]str return nil, fmt.Errorf("unexpected error") } -// updateDeviceMapWithReplicas returns an updated map of resource names to devices with replica information from spec.Config.Sharing.TimeSlicing.Resources -func updateDeviceMapWithReplicas(config *nvidia.DeviceConfig, oDevices DeviceMap) (DeviceMap, error) { +// updateDeviceMapWithReplicas returns an updated map of resource names to devices with replica +// information from the active replicated resources config. +func updateDeviceMapWithReplicas(replicatedResources *spec.ReplicatedResources, oDevices DeviceMap) (DeviceMap, error) { devices := make(DeviceMap) - // Begin by walking config.Sharing.TimeSlicing.Resources and building a map of just the resource names. + // Begin by walking replicatedResources.Resources and building a map of just the resource names. names := make(map[spec.ResourceName]bool) - for _, r := range config.Sharing.TimeSlicing.Resources { + for _, r := range replicatedResources.Resources { names[r.Name] = true } @@ -297,8 +298,9 @@ func updateDeviceMapWithReplicas(config *nvidia.DeviceConfig, oDevices DeviceMap } } - // Walk TimeSlicing.Resources and update devices in the device map as appropriate. - for _, r := range config.Sharing.TimeSlicing.Resources { + // Walk shared Resources and update devices in the device map as appropriate. 
+ for _, resource := range replicatedResources.Resources { + r := resource // Get the IDs of the devices we want to replicate from oDevices ids, err := oDevices.getIDsOfDevicesToReplicate(&r) if err != nil { @@ -325,6 +327,7 @@ func updateDeviceMapWithReplicas(config *nvidia.DeviceConfig, oDevices DeviceMap annotatedID := string(NewAnnotatedID(id, i)) replicatedDevice := *(oDevices[r.Name][id]) replicatedDevice.ID = annotatedID + replicatedDevice.Replicas = r.Replicas devices.insert(name, &replicatedDevice) } } diff --git a/pkg/nvidia-plugin/pkg/rm/device_map_test.go b/pkg/nvidia-plugin/pkg/rm/device_map_test.go new file mode 100644 index 000000000..61b0056b2 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/rm/device_map_test.go @@ -0,0 +1,109 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package rm + +import ( + "testing" + + "github.com/stretchr/testify/require" + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" +) + +func TestDeviceMapInsert(t *testing.T) { + device0 := Device{Device: pluginapi.Device{ID: "0"}} + device0withIndex := Device{Device: pluginapi.Device{ID: "0"}, Index: "index"} + device1 := Device{Device: pluginapi.Device{ID: "1"}} + + testCases := []struct { + description string + deviceMap DeviceMap + key string + value *Device + expectedDeviceMap DeviceMap + }{ + { + description: "insert into empty map", + deviceMap: make(DeviceMap), + key: "resource", + value: &device0, + expectedDeviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + }, + }, + }, + { + description: "add to existing resource", + deviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + }, + }, + key: "resource", + value: &device1, + expectedDeviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + "1": &device1, + }, + }, + }, + { + description: "add new resource", + deviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + }, + }, + key: "resource1", + value: &device0, + expectedDeviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + }, + "resource1": Devices{ + "0": &device0, + }, + }, + }, + { + description: "overwrite existing device", + deviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + }, + }, + key: "resource", + value: &device0withIndex, + expectedDeviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0withIndex, + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + tc.deviceMap.insert(spec.ResourceName(tc.key), tc.value) + + require.EqualValues(t, tc.expectedDeviceMap, tc.deviceMap) + }) + } +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/devices.go b/pkg/nvidia-plugin/pkg/rm/devices.go similarity index 71% rename from 
pkg/device-plugin/nvidiadevice/nvinternal/rm/devices.go rename to pkg/nvidia-plugin/pkg/rm/devices.go index 668404455..f3b77c5fb 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/devices.go +++ b/pkg/nvidia-plugin/pkg/rm/devices.go @@ -1,33 +1,17 @@ /* - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
*/ package rm @@ -37,14 +21,19 @@ import ( "strconv" "strings" - kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) -// Device wraps kubeletdevicepluginv1beta1.Device with extra metadata and functions. +// Device wraps pluginapi.Device with extra metadata and functions. type Device struct { - kubeletdevicepluginv1beta1.Device - Paths []string - Index string + pluginapi.Device + Paths []string + Index string + TotalMemory uint64 + ComputeCapability string + // Replicas stores the total number of times this device is replicated. + // If this is 0 or 1 then the device is not shared. + Replicas int } // deviceInfo defines the information the required to construct a Device @@ -52,6 +41,8 @@ type deviceInfo interface { GetUUID() (string, error) GetPaths() ([]string, error) GetNumaNode() (bool, int, error) + GetTotalMemory() (uint64, error) + GetComputeCapability() (string, error) } // Devices wraps a map[string]*Device with some functions. 
@@ -80,14 +71,27 @@ func BuildDevice(index string, d deviceInfo) (*Device, error) { return nil, fmt.Errorf("error getting device NUMA node: %v", err) } - dev := Device{} + totalMemory, err := d.GetTotalMemory() + if err != nil { + return nil, fmt.Errorf("error getting device memory: %w", err) + } + + computeCapability, err := d.GetComputeCapability() + if err != nil { + return nil, fmt.Errorf("error getting device compute capability: %w", err) + } + + dev := Device{ + TotalMemory: totalMemory, + ComputeCapability: computeCapability, + } dev.ID = uuid dev.Index = index dev.Paths = paths - dev.Health = kubeletdevicepluginv1beta1.Healthy + dev.Health = pluginapi.Healthy if hasNuma { - dev.Topology = &kubeletdevicepluginv1beta1.TopologyInfo{ - Nodes: []*kubeletdevicepluginv1beta1.NUMANode{ + dev.Topology = &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ { ID: int64(numa), }, @@ -155,28 +159,28 @@ func (ds Devices) GetIDs() []string { return res } -// GetPluginDevices returns the plugin Devices from all devices in the Devices -func (ds Devices) GetPluginDevices(count uint) []*kubeletdevicepluginv1beta1.Device { - var res []*kubeletdevicepluginv1beta1.Device - - if !strings.Contains(ds.GetIDs()[0], "MIG") { - for _, dev := range ds { - for i := uint(0); i < count; i++ { - id := fmt.Sprintf("%v-%v", dev.ID, i) - res = append(res, &kubeletdevicepluginv1beta1.Device{ - ID: id, - Health: dev.Health, - Topology: nil, - }) - } - } - } else { - for _, d := range ds { - res = append(res, &d.Device) +// GetUUIDs returns the uuids associated with the Device in the set. 
+func (ds Devices) GetUUIDs() []string { + var res []string + seen := make(map[string]bool) + for _, d := range ds { + uuid := d.GetUUID() + if seen[uuid] { + continue } - + seen[uuid] = true + res = append(res, uuid) } + return res +} +// GetPluginDevices returns the plugin Devices from all devices in the Devices +func (ds Devices) GetPluginDevices() []*pluginapi.Device { + var res []*pluginapi.Device + for _, device := range ds { + d := device + res = append(res, &d.Device) + } return res } @@ -198,7 +202,7 @@ func (ds Devices) GetPaths() []string { return res } -// AlignedAllocationSupported checks whether all devices support an alligned allocation +// AlignedAllocationSupported checks whether all devices support an aligned allocation func (ds Devices) AlignedAllocationSupported() bool { for _, d := range ds { if !d.AlignedAllocationSupported() { @@ -208,7 +212,7 @@ func (ds Devices) AlignedAllocationSupported() bool { return true } -// AlignedAllocationSupported checks whether the device supports an alligned allocation +// AlignedAllocationSupported checks whether the device supports an aligned allocation func (d Device) AlignedAllocationSupported() bool { if d.IsMigDevice() { return false @@ -241,10 +245,7 @@ func NewAnnotatedID(id string, replica int) AnnotatedID { // HasAnnotations checks if an AnnotatedID has any annotations or not. func (r AnnotatedID) HasAnnotations() bool { split := strings.SplitN(string(r), "::", 2) - if len(split) != 2 { - return false - } - return true + return len(split) == 2 } // Split splits a AnnotatedID into its ID and replica number parts. 
diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/health.go b/pkg/nvidia-plugin/pkg/rm/health.go similarity index 81% rename from pkg/device-plugin/nvidiadevice/nvinternal/rm/health.go rename to pkg/nvidia-plugin/pkg/rm/health.go index 8dce3cc07..3a308ff3e 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/health.go +++ b/pkg/nvidia-plugin/pkg/rm/health.go @@ -1,33 +1,17 @@ /* - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. */ package rm @@ -38,7 +22,7 @@ import ( "strconv" "strings" - "github.com/NVIDIA/go-nvlib/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml" "k8s.io/klog/v2" ) @@ -49,9 +33,6 @@ const ( // this is in addition to the Application errors that are already ignored. envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS" allHealthChecks = "xids" - - // maxSuccessiveEventErrorCount sets the number of errors waiting for events before marking all devices as unhealthy. - maxSuccessiveEventErrorCount = 3 ) // CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices @@ -102,11 +83,13 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic if ret != nvml.SUCCESS { return fmt.Errorf("failed to create event set: %v", ret) } - defer eventSet.Free() + defer func() { + _ = eventSet.Free() + }() parentToDeviceMap := make(map[string]*Device) - deviceIDToGiMap := make(map[string]int) - deviceIDToCiMap := make(map[string]int) + deviceIDToGiMap := make(map[string]uint32) + deviceIDToCiMap := make(map[string]uint32) eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError) for _, d := range devices { @@ -129,7 +112,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic supportedEvents, ret := gpu.GetSupportedEventTypes() if ret != nvml.SUCCESS { - klog.Infof("Unable to determine the supported events for %v: %v; marking it as unhealthy", d.ID, ret) + klog.Infof("unable to determine the supported events for %v: %v; marking it as unhealthy", d.ID, ret) unhealthy <- d continue } @@ -193,7 +176,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic if d.IsMigDevice() && e.GpuInstanceId != 0xFFFFFFFF && e.ComputeInstanceId != 0xFFFFFFFF { gi := deviceIDToGiMap[d.ID] ci := 
deviceIDToCiMap[d.ID] - if !(uint32(gi) == e.GpuInstanceId && uint32(ci) == e.ComputeInstanceId) { + if !(gi == e.GpuInstanceId && ci == e.ComputeInstanceId) { continue } klog.Infof("Event for mig device %v (gi=%v, ci=%v)", d.ID, gi, ci) @@ -232,7 +215,7 @@ func getAdditionalXids(input string) []uint64 { // getDevicePlacement returns the placement of the specified device. // For a MIG device the placement is defined by the 3-tuple // For a full device the returned 3-tuple is the device's uuid and 0xFFFFFFFF for the other two elements. -func (r *nvmlResourceManager) getDevicePlacement(d *Device) (string, int, int, error) { +func (r *nvmlResourceManager) getDevicePlacement(d *Device) (string, uint32, uint32, error) { if !d.IsMigDevice() { return d.GetUUID(), 0xFFFFFFFF, 0xFFFFFFFF, nil } @@ -240,7 +223,7 @@ func (r *nvmlResourceManager) getDevicePlacement(d *Device) (string, int, int, e } // getMigDeviceParts returns the parent GI and CI ids of the MIG device. -func (r *nvmlResourceManager) getMigDeviceParts(d *Device) (string, int, int, error) { +func (r *nvmlResourceManager) getMigDeviceParts(d *Device) (string, uint32, uint32, error) { if !d.IsMigDevice() { return "", 0, 0, fmt.Errorf("cannot get GI and CI of full device") } @@ -267,13 +250,14 @@ func (r *nvmlResourceManager) getMigDeviceParts(d *Device) (string, int, int, er if ret != nvml.SUCCESS { return "", 0, 0, fmt.Errorf("failed to get Compute Instance ID: %v", ret) } - return parentUUID, gi, ci, nil + //nolint:gosec // We know that the values returned from Get*InstanceId are within the valid uint32 range. 
+ return parentUUID, uint32(gi), uint32(ci), nil } return parseMigDeviceUUID(uuid) } // parseMigDeviceUUID splits the MIG device UUID into the parent device UUID and ci and gi -func parseMigDeviceUUID(mig string) (string, int, int, error) { +func parseMigDeviceUUID(mig string) (string, uint32, uint32, error) { tokens := strings.SplitN(mig, "-", 2) if len(tokens) != 2 || tokens[0] != "MIG" { return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") @@ -284,15 +268,24 @@ func parseMigDeviceUUID(mig string) (string, int, int, error) { return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") } - gi, err := strconv.ParseInt(tokens[1], 10, 32) + gi, err := toUint32(tokens[1]) if err != nil { return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") } - ci, err := strconv.ParseInt(tokens[2], 10, 32) + ci, err := toUint32(tokens[2]) if err != nil { return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") } - return tokens[0], int(gi), int(ci), nil + return tokens[0], gi, ci, nil +} + +func toUint32(s string) (uint32, error) { + u, err := strconv.ParseUint(s, 10, 32) + if err != nil { + return 0, err + } + //nolint:gosec // Since we parse s with a 32-bit size this will not overflow. + return uint32(u), nil } diff --git a/pkg/nvidia-plugin/pkg/rm/health_test.go b/pkg/nvidia-plugin/pkg/rm/health_test.go new file mode 100644 index 000000000..101aadf78 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/rm/health_test.go @@ -0,0 +1,74 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package rm + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestGetAdditionalXids(t *testing.T) { + testCases := []struct { + input string + expected []uint64 + }{ + {}, + { + input: ",", + }, + { + input: "not-an-int", + }, + { + input: "68", + expected: []uint64{68}, + }, + { + input: "-68", + }, + { + input: "68 ", + expected: []uint64{68}, + }, + { + input: "68,", + expected: []uint64{68}, + }, + { + input: ",68", + expected: []uint64{68}, + }, + { + input: "68,67", + expected: []uint64{68, 67}, + }, + { + input: "68,not-an-int,67", + expected: []uint64{68, 67}, + }, + } + + for i, tc := range testCases { + t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { + xids := getAdditionalXids(tc.input) + + require.EqualValues(t, tc.expected, xids) + }) + } +} diff --git a/pkg/nvidia-plugin/pkg/rm/helper.go b/pkg/nvidia-plugin/pkg/rm/helper.go new file mode 100644 index 000000000..580282c59 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/rm/helper.go @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package rm + +// int8Slice wraps an []int8 with more functions.
+type int8Slice []int8 + +// String turns a nil terminated int8Slice into a string +func (s int8Slice) String() string { + var b []byte + for _, c := range s { + if c == 0 { + break + } + b = append(b, byte(c)) + } + return string(b) +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_devices.go b/pkg/nvidia-plugin/pkg/rm/nvml_devices.go similarity index 65% rename from pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_devices.go rename to pkg/nvidia-plugin/pkg/rm/nvml_devices.go index fe9375b8c..a8c642f0b 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_devices.go +++ b/pkg/nvidia-plugin/pkg/rm/nvml_devices.go @@ -1,33 +1,17 @@ /* - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. 
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package rm @@ -39,10 +23,9 @@ import ( "strconv" "strings" - "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/mig" + "github.com/NVIDIA/go-nvml/pkg/nvml" - "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" - "github.com/NVIDIA/go-nvlib/pkg/nvml" + "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/mig" ) const ( @@ -61,16 +44,16 @@ type nvmlMigDevice nvmlDevice var _ deviceInfo = (*nvmlDevice)(nil) var _ deviceInfo = (*nvmlMigDevice)(nil) -func newGPUDevice(i int, gpu nvml.Device) (string, deviceInfo) { +func newNvmlGPUDevice(i int, gpu nvml.Device) (string, deviceInfo) { index := fmt.Sprintf("%v", i) - isWsl, _ := info.New().HasDXCore() - if isWsl { - return index, wslDevice{gpu} - } - return index, nvmlDevice{gpu} } +func newWslGPUDevice(i int, gpu nvml.Device) (string, deviceInfo) { + index := fmt.Sprintf("%v", i) + return index, wslDevice{gpu} +} + func newMigDevice(i int, j int, mig nvml.Device) (string, nvmlMigDevice) { return fmt.Sprintf("%v:%v", i, j), nvmlMigDevice{mig} } @@ -100,6 +83,24 @@ func (d nvmlDevice) GetPaths() ([]string, error) { return []string{path}, nil } +// GetComputeCapability returns the CUDA Compute Capability for the device. +func (d nvmlDevice) GetComputeCapability() (string, error) { + major, minor, ret := d.Device.GetCudaComputeCapability() + if ret != nvml.SUCCESS { + return "", ret + } + return fmt.Sprintf("%d.%d", major, minor), nil +} + +// GetComputeCapability returns the CUDA Compute Capability for the device.
+func (d nvmlMigDevice) GetComputeCapability() (string, error) { + parent, ret := d.Device.GetDeviceHandleFromMigDeviceHandle() + if ret != nvml.SUCCESS { + return "", fmt.Errorf("failed to get parent device: %w", ret) + } + return nvmlDevice{parent}.GetComputeCapability() +} + // GetPaths returns the paths for a MIG device func (d nvmlMigDevice) GetPaths() ([]string, error) { capDevicePaths, err := mig.GetMigCapabilityDevicePaths() @@ -148,13 +149,13 @@ func (d nvmlMigDevice) GetPaths() ([]string, error) { // GetNumaNode returns the NUMA node associated with the GPU device func (d nvmlDevice) GetNumaNode() (bool, int, error) { - pciInfo, ret := d.GetPciInfo() + info, ret := d.GetPciInfo() if ret != nvml.SUCCESS { return false, 0, fmt.Errorf("error getting PCI Bus Info of device: %v", ret) } // Discard leading zeros. - busID := strings.ToLower(strings.TrimPrefix(int8Slice(pciInfo.BusId[:]).String(), "0000")) + busID := strings.ToLower(strings.TrimPrefix(int8Slice(info.BusId[:]).String(), "0000")) b, err := os.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", busID)) if err != nil { @@ -182,3 +183,21 @@ func (d nvmlMigDevice) GetNumaNode() (bool, int, error) { return nvmlDevice{parent}.GetNumaNode() } + +// GetTotalMemory returns the total memory available on the device. +func (d nvmlDevice) GetTotalMemory() (uint64, error) { + info, ret := d.Device.GetMemoryInfo() + if ret != nvml.SUCCESS { + return 0, ret + } + return info.Total, nil +} + +// GetTotalMemory returns the total memory available on the device. +func (d nvmlMigDevice) GetTotalMemory() (uint64, error) { + info, ret := d.Device.GetMemoryInfo() + if ret != nvml.SUCCESS { + return 0, ret + } + return info.Total, nil +} diff --git a/pkg/nvidia-plugin/pkg/rm/nvml_manager.go b/pkg/nvidia-plugin/pkg/rm/nvml_manager.go new file mode 100644 index 000000000..e071620a2 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/rm/nvml_manager.go @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package rm + +import ( + "fmt" + + "github.com/NVIDIA/go-gpuallocator/gpuallocator" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + "k8s.io/klog/v2" +) + +type nvmlResourceManager struct { + resourceManager + nvml nvml.Interface +} + +var _ ResourceManager = (*nvmlResourceManager)(nil) + +// NewNVMLResourceManagers returns a set of ResourceManagers, one for each NVML resource in 'config'.
+func NewNVMLResourceManagers(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, config *nvidia.DeviceConfig) ([]ResourceManager, error) { + ret := nvmllib.Init() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to initialize NVML: %v", ret) + } + defer func() { + ret := nvmllib.Shutdown() + if ret != nvml.SUCCESS { + klog.Infof("Error shutting down NVML: %v", ret) + } + }() + + deviceMap, err := NewDeviceMap(infolib, devicelib, config) + if err != nil { + return nil, fmt.Errorf("error building device map: %v", err) + } + + var rms []ResourceManager + for resourceName, devices := range deviceMap { + if len(devices) == 0 { + continue + } + r := &nvmlResourceManager{ + resourceManager: resourceManager{ + config: config, + resource: resourceName, + devices: devices, + }, + nvml: nvmllib, + } + rms = append(rms, r) + } + + return rms, nil +} + +// GetPreferredAllocation runs an allocation algorithm over the inputs. +// The algorithm chosen is based both on the incoming set of available devices and various config settings. +func (r *nvmlResourceManager) GetPreferredAllocation(available, required []string, size int) ([]string, error) { + return r.getPreferredAllocation(available, required, size) +} + +// GetDevicePaths returns the required and optional device nodes for the requested resources +func (r *nvmlResourceManager) GetDevicePaths(ids []string) []string { + paths := []string{ + "/dev/nvidiactl", + "/dev/nvidia-uvm", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-modeset", + } + + return append(paths, r.Devices().Subset(ids).GetPaths()...) +} + +// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices +func (r *nvmlResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error { + return r.checkHealth(stop, r.devices, unhealthy) +} + +// getPreferredAllocation runs an allocation algorithm over the inputs. 
+// The algorithm chosen is based both on the incoming set of available devices and various config settings. +func (r *nvmlResourceManager) getPreferredAllocation(available, required []string, size int) ([]string, error) { + // If all of the available devices are full GPUs without replicas, then + // calculate an aligned allocation across those devices. + if r.Devices().AlignedAllocationSupported() && !AnnotatedIDs(available).AnyHasAnnotations() { + return r.alignedAlloc(available, required, size) + } + + // Otherwise, distribute them evenly across all replicated GPUs + return r.distributedAlloc(available, required, size) +} + +// alignedAlloc shells out to the alignedAllocationPolicy that is set in +// order to calculate the preferred allocation. +func (r *nvmlResourceManager) alignedAlloc(available, required []string, size int) ([]string, error) { + var devices []string + + linkedDevices, err := gpuallocator.NewDevices( + gpuallocator.WithNvmlLib(r.nvml), + ) + if err != nil { + return nil, fmt.Errorf("unable to get device link information: %w", err) + } + + availableDevices, err := linkedDevices.Filter(available) + if err != nil { + return nil, fmt.Errorf("unable to retrieve list of available devices: %v", err) + } + + requiredDevices, err := linkedDevices.Filter(required) + if err != nil { + return nil, fmt.Errorf("unable to retrieve list of required devices: %v", err) + } + + allocatedDevices := gpuallocator.NewBestEffortPolicy().Allocate(availableDevices, requiredDevices, size) + for _, device := range allocatedDevices { + devices = append(devices, device.UUID) + } + + return devices, nil +} diff --git a/pkg/nvidia-plugin/pkg/rm/rm.go b/pkg/nvidia-plugin/pkg/rm/rm.go new file mode 100644 index 000000000..5267b60b1 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/rm/rm.go @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package rm + +import ( + "errors" + "fmt" + "strings" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "k8s.io/klog/v2" + + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" +) + +// resourceManager forms the base type for specific resource manager implementations +type resourceManager struct { + config *nvidia.DeviceConfig + resource spec.ResourceName + devices Devices +} + +// ResourceManager provides an interface for listing a set of Devices and checking health on them +type ResourceManager interface { + Resource() spec.ResourceName + Devices() Devices + GetDevicePaths([]string) []string + GetPreferredAllocation(available, required []string, size int) ([]string, error) + CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error + ValidateRequest(AnnotatedIDs) error +} + +// Resource gets the resource name associated with the ResourceManager +func (r *resourceManager) Resource() spec.ResourceName { + return r.resource +} + +// Devices gets the devices managed by the ResourceManager +func (r *resourceManager) Devices() Devices { + return r.devices +} + +var errInvalidRequest = errors.New("invalid request") + +// ValidateRequest checks the requested IDs against the resource manager configuration.
+// It asserts that all requested IDs are known to the resource manager and that the request is +// valid for a specified sharing configuration. +func (r *resourceManager) ValidateRequest(ids AnnotatedIDs) error { + // Assert that all requested IDs are known to the resource manager + for _, id := range ids { + if !r.devices.Contains(id) { + return fmt.Errorf("%w: unknown device: %s", errInvalidRequest, id) + } + } + + // If the devices being allocated are replicas, then (conditionally) + // error out if more than one resource is being allocated. + includesReplicas := ids.AnyHasAnnotations() + numRequestedDevices := len(ids) + switch r.config.Sharing.SharingStrategy() { + case spec.SharingStrategyTimeSlicing: + if includesReplicas && numRequestedDevices > 1 && r.config.Sharing.ReplicatedResources().FailRequestsGreaterThanOne { + return fmt.Errorf("%w: maximum request size for shared resources is 1; found %d", errInvalidRequest, numRequestedDevices) + } + case spec.SharingStrategyMPS: + // For MPS sharing, we explicitly ignore the FailRequestsGreaterThanOne + // value in the sharing settings. + // This setting was added to timeslicing after the initial release and + // is set to `false` to maintain backward compatibility with existing + // deployments. If we do extend MPS to allow multiple devices to be + // requested, the MPS API will be extended separately from the + // time-slicing API. 
+ if includesReplicas && numRequestedDevices > 1 { + return fmt.Errorf("%w: maximum request size for shared resources is 1; found %d", errInvalidRequest, numRequestedDevices) + } + } + return nil +} + +// AddDefaultResourcesToConfig adds default resource matching rules to config.Resources +func AddDefaultResourcesToConfig(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, config *spec.Config) error { + _ = config.Resources.AddGPUResource("*", "gpu") + if config.Flags.MigStrategy == nil { + return nil + } + switch *config.Flags.MigStrategy { + case spec.MigStrategySingle: + return config.Resources.AddMIGResource("*", "gpu") + case spec.MigStrategyMixed: + hasNVML, reason := infolib.HasNvml() + if !hasNVML { + klog.Warningf("mig-strategy=%q is only supported with NVML", spec.MigStrategyMixed) + klog.Warningf("NVML not detected: %v", reason) + return nil + } + + ret := nvmllib.Init() + if ret != nvml.SUCCESS { + if *config.Flags.FailOnInitError { + return fmt.Errorf("failed to initialize NVML: %v", ret) + } + return nil + } + defer func() { + ret := nvmllib.Shutdown() + if ret != nvml.SUCCESS { + klog.Errorf("Error shutting down NVML: %v", ret) + } + }() + + return devicelib.VisitMigProfiles(func(p device.MigProfile) error { + info := p.GetInfo() + if info.C != info.G { + return nil + } + resourceName := strings.ReplaceAll("mig-"+p.String(), "+", ".") + return config.Resources.AddMIGResource(p.String(), resourceName) + }) + } + return nil +} diff --git a/pkg/nvidia-plugin/pkg/rm/rm_test.go b/pkg/nvidia-plugin/pkg/rm/rm_test.go new file mode 100644 index 000000000..24fbc8c11 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/rm/rm_test.go @@ -0,0 +1,195 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package rm + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" +) + +func TestValidateRequest(t *testing.T) { + testCases := []struct { + description string + devices Devices + sharing spec.Sharing + requestDevicesIDs []string + + expectedError error + }{ + { + description: "valid device IDs -- no sharing", + devices: Devices{ + "device0": nil, + "device1": nil, + }, + requestDevicesIDs: []string{"device1"}, + }, + { + description: "invalid device IDs -- no sharing", + devices: Devices{ + "device0": nil, + "device1": nil, + }, + requestDevicesIDs: []string{"device1", "device2"}, + expectedError: errInvalidRequest, + }, + { + description: "timeslicing with single device", + sharing: spec.Sharing{ + TimeSlicing: spec.ReplicatedResources{ + Resources: []spec.ReplicatedResource{ + { + Name: "nvidia.com/gpu", + Replicas: 2, + }, + }, + }, + }, + devices: Devices{ + "device0::0": nil, + "device0::1": nil, + "device1::0": nil, + "device1::1": nil, + }, + requestDevicesIDs: []string{"device0::1"}, + }, + { + description: "timeslicing with two devices", + sharing: spec.Sharing{ + TimeSlicing: spec.ReplicatedResources{ + Resources: []spec.ReplicatedResource{ + { + Name: "nvidia.com/gpu", + Replicas: 2, + }, + }, + }, + }, + devices: Devices{ + "device0::0": nil, + "device0::1": nil, + "device1::0": nil, + "device1::1": nil, + }, + requestDevicesIDs: []string{"device0::1", "device1::0"}, + }, + { + description: "timeslicing 
with two devices -- failRequestsGreaterThanOne", + sharing: spec.Sharing{ + TimeSlicing: spec.ReplicatedResources{ + FailRequestsGreaterThanOne: true, + Resources: []spec.ReplicatedResource{ + { + Name: "nvidia.com/gpu", + Replicas: 2, + }, + }, + }, + }, + devices: Devices{ + "device0::0": nil, + "device0::1": nil, + "device1::0": nil, + "device1::1": nil, + }, + requestDevicesIDs: []string{"device0::1", "device1::0"}, + expectedError: errInvalidRequest, + }, + { + description: "MPS with single device", + sharing: spec.Sharing{ + MPS: &spec.ReplicatedResources{ + Resources: []spec.ReplicatedResource{ + { + Name: "nvidia.com/gpu", + Replicas: 2, + }, + }, + }, + }, + devices: Devices{ + "device0::0": nil, + "device0::1": nil, + "device1::0": nil, + "device1::1": nil, + }, + requestDevicesIDs: []string{"device0::1"}, + }, + { + description: "MPS with two devices", + sharing: spec.Sharing{ + MPS: &spec.ReplicatedResources{ + Resources: []spec.ReplicatedResource{ + { + Name: "nvidia.com/gpu", + Replicas: 2, + }, + }, + }, + }, + devices: Devices{ + "device0::0": nil, + "device0::1": nil, + "device1::0": nil, + "device1::1": nil, + }, + requestDevicesIDs: []string{"device0::1", "device1::0"}, + expectedError: errInvalidRequest, + }, + { + description: "MPS with two devices -- failRequestsGreaterThanOne", + sharing: spec.Sharing{ + MPS: &spec.ReplicatedResources{ + FailRequestsGreaterThanOne: true, + Resources: []spec.ReplicatedResource{ + { + Name: "nvidia.com/gpu", + Replicas: 2, + }, + }, + }, + }, + devices: Devices{ + "device0::0": nil, + "device0::1": nil, + "device1::0": nil, + "device1::1": nil, + }, + requestDevicesIDs: []string{"device0::1", "device1::0"}, + expectedError: errInvalidRequest, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + r := resourceManager{ + config: &nvidia.DeviceConfig{ + Config: &spec.Config{ + Sharing: tc.sharing, + }, + }, + devices: tc.devices, + } + err := 
r.ValidateRequest(tc.requestDevicesIDs) + require.ErrorIs(t, err, tc.expectedError) + }) + } +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_devices.go b/pkg/nvidia-plugin/pkg/rm/tegra_devices.go similarity index 53% rename from pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_devices.go rename to pkg/nvidia-plugin/pkg/rm/tegra_devices.go index 4e824d378..44d72ec72 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_devices.go +++ b/pkg/nvidia-plugin/pkg/rm/tegra_devices.go @@ -1,34 +1,18 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The HAMi Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to NVIDIA CORPORATION under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. NVIDIA CORPORATION licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
// GetTotalMemory is unsupported for a Tegra device: it always reports 0
// together with a nil error instead of failing.
func (d *tegraDevice) GetTotalMemory() (uint64, error) {
	return 0, nil
}

// GetComputeCapability is unimplemented for a Tegra device: it always returns
// the placeholder string "0.0" together with a nil error.
func (d *tegraDevice) GetComputeCapability() (string, error) {
	return "0.0", nil
}
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright The HAMi Authors. See - * GitHub history for details. - */ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
// wslDevice is an nvmlDevice with its own method set. Every method except
// GetPaths converts the receiver back to nvmlDevice and delegates to it.
type wslDevice nvmlDevice

// Compile-time check that *wslDevice satisfies deviceInfo.
var _ deviceInfo = (*wslDevice)(nil)

// GetUUID returns the UUID of the device
func (d wslDevice) GetUUID() (string, error) {
	return nvmlDevice(d).GetUUID()
}

// GetPaths returns the paths for a WSL device. The result is always the
// single fixed path "/dev/dxg"; nothing is queried from the device.
func (d wslDevice) GetPaths() ([]string, error) {
	return []string{"/dev/dxg"}, nil
}

// GetNumaNode returns the NUMA node associated with the GPU device
func (d wslDevice) GetNumaNode() (bool, int, error) {
	return nvmlDevice(d).GetNumaNode()
}

// GetTotalMemory returns the total memory available on the device.
func (d wslDevice) GetTotalMemory() (uint64, error) {
	return nvmlDevice(d).GetTotalMemory()
}

// GetComputeCapability returns the CUDA compute capability for the device.
func (d wslDevice) GetComputeCapability() (string, error) {
	return nvmlDevice(d).GetComputeCapability()
}
// NvidiaPCI interface allows us to get a list of all NVIDIA PCI devices
type NvidiaPCI interface {
	Devices() ([]*PCIDevice, error)
}

// PCIDevice represents a single PCI device
type PCIDevice struct {
	// Path is the sysfs directory of the device.
	Path string
	// Address is the name of that directory entry.
	Address string
	// Class holds the leading characters (at most four) of the sysfs "class" file.
	Class string
	// Vendor is the trimmed content of the sysfs "vendor" file.
	Vendor string
	// Config is the raw content of the sysfs "config" file.
	Config []byte
}

const (
	// PciDevicesRoot represents base path for all pci devices under sysfs
	PciDevicesRoot = "/sys/bus/pci/devices"
	// PciStatusByte indicates status byte
	PciStatusByte = 0x06
	// PciStatusCapabilityList indicates if capability list is supported
	PciStatusCapabilityList = 0x10
	// PciCapabilityList indicates offset of first capability list entry
	PciCapabilityList = 0x34
	// PciCapabilityListID indicates offset for capability id
	PciCapabilityListID = 0
	// PciCapabilityListNext indicates offset for next capability in the list
	PciCapabilityListNext = 1
	// PciCapabilityLength indicates offset for capability length
	PciCapabilityLength = 2
	// PciCapabilityVendorSpecificID indicates PCI vendor specific capability id
	PciCapabilityVendorSpecificID = 0x09
	// PciNvidiaVendorID represents PCI vendor id for Nvidia
	PciNvidiaVendorID = "0x10de"
)

// NvidiaPCILib implements the NvidiaPCI interface
type NvidiaPCILib struct{}

// NewNvidiaPCILib returns an instance of NvidiaPCILib implementing the NvidiaPCI interface
func NewNvidiaPCILib() NvidiaPCI {
	return &NvidiaPCILib{}
}

// Devices returns every PCI device under PciDevicesRoot whose vendor id is
// PciNvidiaVendorID; devices from other vendors are skipped.
//
// An error is returned when the device directory cannot be listed, or when
// the vendor, class or config file of a device cannot be read.
func (p *NvidiaPCILib) Devices() ([]*PCIDevice, error) {
	deviceDirs, err := os.ReadDir(PciDevicesRoot)
	if err != nil {
		return nil, fmt.Errorf("unable to read PCI bus devices: %v", err)
	}

	var devices []*PCIDevice
	for _, deviceDir := range deviceDirs {
		address := deviceDir.Name()
		devicePath := path.Join(PciDevicesRoot, address)

		vendor, err := os.ReadFile(path.Join(devicePath, "vendor"))
		if err != nil {
			return nil, fmt.Errorf("unable to read PCI device vendor id for %s: %v", address, err)
		}
		vendorID := strings.TrimSpace(string(vendor))
		if vendorID != PciNvidiaVendorID {
			continue
		}

		class, err := os.ReadFile(path.Join(devicePath, "class"))
		if err != nil {
			return nil, fmt.Errorf("unable to read PCI device class for %s: %v", address, err)
		}
		// Keep only the first four characters, without panicking when the
		// file is unexpectedly shorter than that.
		classID := string(class)
		if len(classID) > 4 {
			classID = classID[:4]
		}

		config, err := os.ReadFile(path.Join(devicePath, "config"))
		if err != nil {
			return nil, fmt.Errorf("unable to read PCI configuration space for %s: %v", address, err)
		}

		devices = append(devices, &PCIDevice{
			Path:    devicePath,
			Address: address,
			Vendor:  vendorID,
			Class:   classID,
			Config:  config,
		})
	}

	return devices, nil
}

// GetVendorSpecificCapability returns the vendor specific capability from configuration space
//
// The capability list is walked starting at the offset stored at
// PciCapabilityList. The returned slice aliases d.Config and covers the
// capability's length byte worth of data starting at its id byte.
//
// It returns (nil, nil) when the status byte reports no capability list, or
// when the walk ends (next pointer 0, loop, id 0xff, or a header that does not
// fit in the configuration space) without meeting a vendor specific capability.
// It returns an error when fewer than 256 bytes of configuration space are
// available, or when the vendor specific capability claims a length that runs
// past the end of the configuration space.
func (d *PCIDevice) GetVendorSpecificCapability() ([]byte, error) {
	if len(d.Config) < 256 {
		return nil, fmt.Errorf("entire PCI configuration is not read for device %s. Please run GFD with privileged mode to read complete PCI configuration data", d.Address)
	}

	if d.Config[PciStatusByte]&PciStatusCapabilityList == 0 {
		return nil, nil
	}

	// Offsets are tracked as int: one-byte arithmetic would silently wrap
	// around for capabilities located near the end of the 256 byte header.
	var visited [256]bool
	pos := int(d.Config[PciCapabilityList])
	for pos != 0 {
		if visited[pos] {
			// chain looped
			break
		}
		visited[pos] = true

		if pos+PciCapabilityLength >= len(d.Config) {
			// capability header does not fit: chain broken
			break
		}

		id := d.Config[pos+PciCapabilityListID]
		if id == 0xff {
			// chain broken
			break
		}
		if id == PciCapabilityVendorSpecificID {
			end := pos + int(d.Config[pos+PciCapabilityLength])
			if end > len(d.Config) {
				return nil, fmt.Errorf("vendor specific capability at offset %#x extends past the PCI configuration space of device %s", pos, d.Address)
			}
			return d.Config[pos:end], nil
		}

		pos = int(d.Config[pos+PciCapabilityListNext])
	}

	return nil, nil
}

// GetByte returns a single byte of data at specified position
func GetByte(buffer []byte, pos uint8) uint8 {
	return buffer[pos]
}

// GetWord returns 2 bytes of data from specified position, decoded as a
// little-endian 16-bit value.
func GetWord(buffer []byte, pos int) uint16 {
	return uint16(buffer[pos]) | (uint16(buffer[pos+1]) << 8)
}

// GetLong returns 4 bytes of data from specified position, decoded as a
// little-endian 32-bit value.
func GetLong(buffer []byte, pos int) uint32 {
	return uint32(buffer[pos]) |
		uint32(buffer[pos+1])<<8 |
		uint32(buffer[pos+2])<<16 |
		uint32(buffer[pos+3])<<24
}

// MockNvidiaPCI represents mock of NvidiaPCI interface
type MockNvidiaPCI struct {
	devices []*PCIDevice
}

// Devices returns PCI devices with mocked data
func (p *MockNvidiaPCI) Devices() ([]*PCIDevice, error) {
	return p.devices, nil
}
0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x01, 0x00, 0x00, 0xde, 0x10, 0x14, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xce, 0xd6, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x68, 0x03, 0x00, 0x08, 0x00, 0x00, 0x00, 0x05, 0x78, 0x81, 0x00, 0x00, 0x70, 0xe6, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x10, 0xb4, 0x02, 0x00, 0xe1, 0x8d, 0x64, 0x00, 0x10, 0x29, 0x00, 0x00, 0x03, 0x3d, 0x45, 0x10, 0x00, 0x00, 0x01, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x14, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} + vgpuConfig = []byte{0xde, 0x10, 0xb8, 0x1e, 0x02, 0x05, 0xff, 0x06, 0xa1, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x0c, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0x10, 0x0f, 0x13, 0x00, 0x00, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xce, 0xd6, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x81, 0x00, 0x00, 0x00, 0xe0, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x4e, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, 0x68, 0x1b, 0x56, 0x46, 0x00, 0x16, 0x34, 0x36, 0x30, 0x2e, 0x31, 0x36, 0x00, 0x00, 0x00, 0x00, 0x72, 0x34, 0x36, 0x30, 0x5f, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} + ) + + return &MockNvidiaPCI{ + devices: []*PCIDevice{ + { + Path: "", + Address: "passthrough", + Vendor: "0x10de", + Class: "300", + Config: gpuPassThroughConfig, + }, + { + Path: "", + Address: "vgpu", + Vendor: "0x10de", + Class: "300", + Config: vgpuConfig, + }, + }, + } +} diff --git a/pkg/nvidia-plugin/pkg/vgpu/pciutil_test.go b/pkg/nvidia-plugin/pkg/vgpu/pciutil_test.go new file mode 100644 index 000000000..eaacff609 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/vgpu/pciutil_test.go @@ -0,0 +1,42 @@ +/** +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package vgpu + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestGetVendorSpecificCapability(t *testing.T) { + devices, _ := NewMockNvidiaPCI().Devices() + for _, device := range devices { + // check for vendor id + require.Equal(t, "0x10de", fmt.Sprintf("0x%x", GetWord(device.Config, 0)), "Nvidia PCI Vendor ID") + // check for vendor specific capability + capability, err := device.GetVendorSpecificCapability() + require.NoError(t, err, "Get vendor specific capability from configuration space") + require.NotZero(t, len(capability), "Vendor capability record") + if device.Address == "passthrough" { + require.Equal(t, 20, len(capability), "Vendor capability length for passthrough device") + } + if device.Address == "vgpu" { + require.Equal(t, 27, len(capability), "Vendor capability length for vgpu device") + } + } +} diff --git a/pkg/nvidia-plugin/pkg/vgpu/vgpu.go b/pkg/nvidia-plugin/pkg/vgpu/vgpu.go new file mode 100644 index 000000000..828e2cb7e --- /dev/null +++ b/pkg/nvidia-plugin/pkg/vgpu/vgpu.go @@ -0,0 +1,153 @@ +/** +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package vgpu + +import ( + "fmt" + "strings" +) + +// Interface allows us to get a list of vGPU specific PCI devices +type Interface interface { + Devices() ([]*Device, error) +} + +// Device is just an alias to a PCIDevice +type Device struct { + pci *PCIDevice + vGPUCapability []byte +} + +// Info represents vGPU driver info running on underlying hypervisor host. +type Info struct { + HostDriverVersion string + HostDriverBranch string +} + +const ( + // VGPUCapabilityRecordStart indicates offset of beginning vGPU capability record + VGPUCapabilityRecordStart uint8 = 5 + // HostDriverVersionLength indicates max length of driver version + HostDriverVersionLength = 10 + // HostDriverBranchLength indicates max length of driver branch + HostDriverBranchLength = 10 +) + +// Lib implements the NvidiaVGPU interface +type Lib struct { + pci NvidiaPCI +} + +// NewVGPULib returns an instance of Lib implementing the VGPU interface +func NewVGPULib(pci NvidiaPCI) Interface { + return &Lib{pci: pci} +} + +// NewMockVGPU initializes and returns mock Interface interface type +func NewMockVGPU() Interface { + return NewVGPULib(NewMockNvidiaPCI()) +} + +// Devices returns all vGPU devices attached to the guest +func (v *Lib) Devices() ([]*Device, error) { + pciDevices, err := v.pci.Devices() + if err != nil { + return nil, fmt.Errorf("error getting NVIDIA specific PCI devices: %v", err) + } + + var vgpus []*Device + for _, device := range pciDevices { + capability, err := device.GetVendorSpecificCapability() + if err != nil { + return nil, fmt.Errorf("unable to read vendor specific capability for %s: %v", device.Address, err) + } + if capability == nil { + continue + } + if exists := v.IsVGPUDevice(capability); exists { + vgpu := &Device{ + pci: device, + vGPUCapability: capability, + } + vgpus = append(vgpus, vgpu) + } + } + return vgpus, nil +} + +// IsVGPUDevice returns true if the device is of type vGPU +func (v *Lib) IsVGPUDevice(capability []byte) bool { + if 
len(capability) < 5 { + return false + } + // check for vGPU signature, 0x56, 0x46 i.e "VF" + if capability[3] != 0x56 { + return false + } + if capability[4] != 0x46 { + return false + } + return true +} + +// GetInfo returns information about vGPU manager running on the underlying hypervisor host +func (d *Device) GetInfo() (*Info, error) { + if len(d.vGPUCapability) == 0 { + return nil, fmt.Errorf("vendor capability record is not populated for device %s", d.pci.Address) + } + + // traverse vGPU vendor capability records until host driver version record(id: 0) is found + var hostDriverVersion string + var hostDriverBranch string + foundDriverVersionRecord := false + pos := VGPUCapabilityRecordStart + record := GetByte(d.vGPUCapability, VGPUCapabilityRecordStart) + for record != 0 && int(pos) < len(d.vGPUCapability) { + // find next record + recordLength := GetByte(d.vGPUCapability, pos+1) + pos += recordLength + record = GetByte(d.vGPUCapability, pos) + } + + if record == 0 && int(pos+2+HostDriverVersionLength+HostDriverBranchLength) <= len(d.vGPUCapability) { + foundDriverVersionRecord = true + // found vGPU host driver version record type + // initialized at record data byte, i.e pos + 1(record id byte) + 1(record lengh byte) + i := pos + 2 + // 10 bytes of driver version + for ; i < pos+2+HostDriverVersionLength; i++ { + hostDriverVersion += string(GetByte(d.vGPUCapability, i)) + } + hostDriverVersion = strings.Trim(hostDriverVersion, "\x00") + // 10 bytes of driver branch + for ; i < pos+2+HostDriverVersionLength+HostDriverBranchLength; i++ { + hostDriverBranch += string(GetByte(d.vGPUCapability, i)) + } + hostDriverBranch = strings.Trim(hostDriverBranch, "\x00") + } + + if !foundDriverVersionRecord { + return nil, fmt.Errorf("cannot find driver version record in vendor specific capability for device %s", d.pci.Address) + } + + info := &Info{ + HostDriverVersion: hostDriverVersion, + HostDriverBranch: hostDriverBranch, + } + + return info, nil +} diff --git 
a/pkg/nvidia-plugin/pkg/vgpu/vgpu_test.go b/pkg/nvidia-plugin/pkg/vgpu/vgpu_test.go new file mode 100644 index 000000000..1d8cd5e44 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/vgpu/vgpu_test.go @@ -0,0 +1,74 @@ +/** +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package vgpu + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +// MockVGPU represents mock of VGPU interface +type MockVGPU struct { + devices []*Device +} + +// Devices returns VGPU devices with mocked data +func (p *MockVGPU) Devices() ([]*Device, error) { + return p.devices, nil +} + +func TestIsVGPUDevice(t *testing.T) { + mockVGPU := NewMockVGPU().(*Lib) + devices, _ := mockVGPU.pci.Devices() + for _, device := range devices { + // check for vendor id + require.Equal(t, "0x10de", fmt.Sprintf("0x%x", GetWord(device.Config, 0)), "Nvidia PCI Vendor ID") + // check for vendor capability records + capability, err := device.GetVendorSpecificCapability() + require.NoError(t, err, "Get vendor capabilities from configuration space") + require.NotZero(t, len(capability), "Vendor capability record") + if device.Address == "passthrough" { + require.False(t, mockVGPU.IsVGPUDevice(capability), "Is not a virtual GPU device") + require.Equal(t, 20, len(capability), "Vendor capability length for passthrough device") + } + if device.Address == "vgpu" { + require.Equal(t, 27, len(capability), "Vendor capability length 
for vgpu device") + require.Equal(t, uint8(9), GetByte(capability, 0), "Vendor capability ID") + } + } +} + +func TestVGPUGetInfo(t *testing.T) { + devices, _ := NewMockVGPU().Devices() + for _, device := range devices { + if device.pci.Address == "vgpu" { + require.NotEmpty(t, device.pci.Config, "Device Configuration data") + require.Equal(t, len(device.pci.Config), 256, "Device configuration data length") + + require.NotEmpty(t, device.vGPUCapability, "Vendor capability record") + require.Equal(t, device.vGPUCapability[0], uint8(9), "Vendor capability id") + + info, err := device.GetInfo() + require.NoError(t, err, "Get host driver version and branch") + require.NotNil(t, info, "Host driver info") + require.Equal(t, "460.16", info.HostDriverVersion, "Host driver version") + require.Equal(t, "r460_00", info.HostDriverBranch, "Host driver branch") + } + } +} diff --git a/pkg/nvidia-plugin/pkg/watch/watchers.go b/pkg/nvidia-plugin/pkg/watch/watchers.go new file mode 100644 index 000000000..06ccceac7 --- /dev/null +++ b/pkg/nvidia-plugin/pkg/watch/watchers.go @@ -0,0 +1,49 @@ +/* +# Copyright NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ +package watch + +import ( + "os" + "os/signal" + + "github.com/fsnotify/fsnotify" +) + +// Files creates a Watcher for the specified files. 
+func Files(files ...string) (*fsnotify.Watcher, error) { + watcher, err := fsnotify.NewWatcher() + if err != nil { + return nil, err + } + + for _, f := range files { + err = watcher.Add(f) + if err != nil { + watcher.Close() + return nil, err + } + } + + return watcher, nil +} + +// Signals creats a channel for the specified signals. +func Signals(sigs ...os.Signal) chan os.Signal { + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, sigs...) + + return sigChan +} diff --git a/version.mk b/version.mk index e303f2251..204237baa 100644 --- a/version.mk +++ b/version.mk @@ -1,13 +1,21 @@ -GO=go -GO111MODULE=on -CMDS=scheduler vGPUmonitor -DEVICES=nvidia -OUTPUT_DIR=bin -TARGET_ARCH=amd64 -GOLANG_IMAGE=golang:1.22.5-bullseye -NVIDIA_IMAGE=nvidia/cuda:12.3.2-devel-ubuntu20.04 -DEST_DIR=/usr/local/vgpu/ +# Build configuration +GO := go +GO111MODULE := on +CMDS := scheduler vGPUmonitor +DEVICES := nvidia +ARCH := linux-amd64 -VERSION = v0.0.1 -IMG_NAME =hami -IMG_TAG="${IMG_NAME}:${VERSION}" \ No newline at end of file +# Path configuration +OUTPUT_DIR := bin +TARGET_ARCH := amd64 +DEST_DIR := /usr/local/vgpu + +# Base images +GOLANG_IMAGE := golang:1.22.5-bullseye +NVIDIA_DEVEL_IMAGE:= nvcr.io/nvidia/cuda:12.6.3-devel-ubuntu22.04 +NVIDIA_IMAGE := nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04 + +# Version control +VERSION := v0.0.1 +IMG_NAME := hami-device-plugin +IMG_TAG := ${IMG_NAME}:${VERSION} \ No newline at end of file From 3d965e7118352acf0fe892eec08542b1feb0d5a6 Mon Sep 17 00:00:00 2001 From: haitwang-cloud Date: Tue, 1 Apr 2025 10:57:08 +0800 Subject: [PATCH 2/5] feat: enhance GetPluginDevices to support device split count --- pkg/nvidia-plugin/pkg/plugin/server.go | 2 +- pkg/nvidia-plugin/pkg/rm/devices.go | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/pkg/nvidia-plugin/pkg/plugin/server.go b/pkg/nvidia-plugin/pkg/plugin/server.go index 37643f21a..af8c3843f 100644 --- a/pkg/nvidia-plugin/pkg/plugin/server.go 
+++ b/pkg/nvidia-plugin/pkg/plugin/server.go @@ -659,7 +659,7 @@ func (plugin *NvidiaDevicePlugin) deviceIDsFromAnnotatedDeviceIDs(ids []string) } func (plugin *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device { - return plugin.rm.Devices().GetPluginDevices() + return plugin.rm.Devices().GetPluginDevices(plugin.schedulerConfig.DeviceSplitCount) } // updateResponseForDeviceListEnvVar sets the environment variable for the requested devices. diff --git a/pkg/nvidia-plugin/pkg/rm/devices.go b/pkg/nvidia-plugin/pkg/rm/devices.go index f3b77c5fb..150c9a19b 100644 --- a/pkg/nvidia-plugin/pkg/rm/devices.go +++ b/pkg/nvidia-plugin/pkg/rm/devices.go @@ -175,11 +175,24 @@ func (ds Devices) GetUUIDs() []string { } // GetPluginDevices returns the plugin Devices from all devices in the Devices -func (ds Devices) GetPluginDevices() []*pluginapi.Device { +func (ds Devices) GetPluginDevices(count uint) []*pluginapi.Device { var res []*pluginapi.Device - for _, device := range ds { - d := device - res = append(res, &d.Device) + if !strings.Contains(ds.GetIDs()[0], "MIG") { + for _, dev := range ds { + for i := uint(0); i < count; i++ { + id := fmt.Sprintf("%v-%v", dev.ID, i) + res = append(res, &pluginapi.Device{ + ID: id, + Health: dev.Health, + Topology: nil, + }) + } + } + } else { + for _, device := range ds { + d := device + res = append(res, &d.Device) + } } return res } From 0fc742bcd9cfd27841ee28f06d919befa4026959 Mon Sep 17 00:00:00 2001 From: haitwang-cloud Date: Tue, 1 Apr 2025 16:29:47 +0800 Subject: [PATCH 3/5] refactor: update AddDefaultResourcesToConfig function and remove unused MPS files --- cmd/device-plugin/nvidia/main.go | 2 +- pkg/nvidia-plugin/mps-control-daemon/main.go | 255 ---------------- .../mps-control-daemon/mount/mount-shm.go | 108 ------- .../mps-control-daemon/mps/daemon.go | 280 ------------------ .../mps-control-daemon/mps/device.go | 55 ---- .../mps-control-daemon/mps/device_test.go | 112 ------- .../mps-control-daemon/mps/log-tailer.go | 69 
----- .../mps-control-daemon/mps/manager.go | 112 ------- .../mps-control-daemon/mps/options.go | 29 -- .../mps-control-daemon/mps/root.go | 59 ---- pkg/nvidia-plugin/pkg/plugin/mps.go | 91 ------ pkg/nvidia-plugin/pkg/plugin/server.go | 47 +-- pkg/nvidia-plugin/pkg/rm/rm.go | 14 +- 13 files changed, 23 insertions(+), 1210 deletions(-) delete mode 100644 pkg/nvidia-plugin/mps-control-daemon/main.go delete mode 100644 pkg/nvidia-plugin/mps-control-daemon/mount/mount-shm.go delete mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/daemon.go delete mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/device.go delete mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/device_test.go delete mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/log-tailer.go delete mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/manager.go delete mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/options.go delete mode 100644 pkg/nvidia-plugin/mps-control-daemon/mps/root.go delete mode 100644 pkg/nvidia-plugin/pkg/plugin/mps.go diff --git a/cmd/device-plugin/nvidia/main.go b/cmd/device-plugin/nvidia/main.go index cec7923ca..33a9e9071 100644 --- a/cmd/device-plugin/nvidia/main.go +++ b/cmd/device-plugin/nvidia/main.go @@ -357,7 +357,7 @@ func startPlugins(c *cli.Context, o *options) ([]plugin.Interface, bool, error) // Update the configuration file with default resources. 
klog.Info("Updating config with default resource matching patterns.") - err = rm.AddDefaultResourcesToConfig(infolib, nvmllib, devicelib, devConfig.Config) + err = rm.AddDefaultResourcesToConfig(infolib, nvmllib, devicelib, devConfig) if err != nil { return nil, false, fmt.Errorf("unable to add default resources to config: %v", err) } diff --git a/pkg/nvidia-plugin/mps-control-daemon/main.go b/pkg/nvidia-plugin/mps-control-daemon/main.go deleted file mode 100644 index 29259c29c..000000000 --- a/pkg/nvidia-plugin/mps-control-daemon/main.go +++ /dev/null @@ -1,255 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-**/ - -package main - -import ( - "encoding/json" - "errors" - "fmt" - "os" - "syscall" - "time" - - "github.com/urfave/cli/v2" - "k8s.io/klog/v2" - - "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - nvinfo "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" - "github.com/NVIDIA/go-nvml/pkg/nvml" - - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" - "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/mps-control-daemon/mount" - "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/mps-control-daemon/mps" - "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/info" - "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/logger" - "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" - "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/watch" - - spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" -) - -// Config represents a collection of config options for the device plugin. -type Config struct { - configFile string - - // flags stores the CLI flags for later processing. - flags []cli.Flag -} - -func main() { - config := &Config{} - - c := cli.NewApp() - c.Name = "NVIDIA MPS Control Daemon" - c.Version = info.GetVersionString() - c.Action = func(ctx *cli.Context) error { - return start(ctx, config) - } - c.Commands = []*cli.Command{ - mount.NewCommand(), - } - - config.flags = []cli.Flag{ - &cli.StringFlag{ - Name: "config-file", - Usage: "the path to a config file as an alternative to command line options or environment variables", - Destination: &config.configFile, - EnvVars: []string{"CONFIG_FILE"}, - }, - &cli.StringFlag{ - Name: "mig-strategy", - Value: spec.MigStrategyNone, - Usage: "the desired strategy for exposing MIG devices on GPUs that support it:\n\t\t[none | single | mixed]", - EnvVars: []string{"MIG_STRATEGY"}, - }, - } - c.Flags = config.flags - - klog.InfoS(c.Name, "version", c.Version) - err := c.Run(os.Args) - if err != nil { - klog.Error(err) - os.Exit(1) - } -} - -// TODO: This needs to do similar validation to the plugin. 
-func validateFlags(config *spec.Config) error { - return nil -} - -// loadConfig loads the config from the spec file. -func (cfg *Config) loadConfig(c *cli.Context) (*spec.Config, error) { - config, err := spec.NewConfig(c, cfg.flags) - if err != nil { - return nil, fmt.Errorf("unable to finalize config: %w", err) - } - err = validateFlags(config) - if err != nil { - return nil, fmt.Errorf("unable to validate flags: %w", err) - } - config.Flags.GFD = nil - - return config, nil -} - -// loadConfig loads the config from the spec file. -func (cfg *Config) loadNvidiaConfig(c *cli.Context) (*nvidia.DeviceConfig, error) { - devcfg := &nvidia.DeviceConfig{} - - config, err := spec.NewConfig(c, cfg.flags) - if err != nil { - return nil, fmt.Errorf("unable to finalize config: %w", err) - } - err = validateFlags(config) - if err != nil { - return nil, fmt.Errorf("unable to validate flags: %w", err) - } - config.Flags.GFD = nil - // Set the config in the device config. - devcfg.Config = config - return devcfg, nil -} - -func start(c *cli.Context, cfg *Config) error { - klog.Info("Starting OS watcher.") - sigs := watch.Signals(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) - var started bool - var restartTimeout <-chan time.Time - var daemons []*mps.Daemon -restart: - // If we are restarting, stop daemons from previous run. - if started { - err := stopDaemons(daemons...) - if err != nil { - return fmt.Errorf("error stopping plugins from previous run: %v", err) - } - } - - klog.Info("Starting Daemons.") - daemons, restartDaemons, err := startDaemons(c, cfg) - if err != nil { - return fmt.Errorf("error starting plugins: %v", err) - } - started = true - - if restartDaemons { - klog.Infof("Failed to start one or more MPS deamons. Retrying in 30s...") - restartTimeout = time.After(30 * time.Second) - } - - // Start an infinite loop, waiting for several indicators to either log - // some messages, trigger a restart of the plugins, or exit the program. 
- for { - select { - // If the restart timeout has expired, then restart the plugins - case <-restartTimeout: - goto restart - - // Watch for any signals from the OS. On SIGHUP, restart this loop, - // restarting all of the plugins in the process. On all other - // signals, exit the loop and exit the program. - case s := <-sigs: - switch s { - case syscall.SIGHUP: - klog.Info("Received SIGHUP, restarting.") - goto restart - default: - klog.Infof("Received signal \"%v\", shutting down.", s) - goto exit - } - } - } -exit: - if err := stopDaemons(daemons...); err != nil { - return fmt.Errorf("error stopping daemons: %v", err) - } - return nil -} - -func startDaemons(c *cli.Context, cfg *Config) ([]*mps.Daemon, bool, error) { - // Load the configuration file - klog.Info("Loading configuration.") - config, err := cfg.loadNvidiaConfig(c) - if err != nil { - return nil, false, fmt.Errorf("unable to load config: %v", err) - } - spec.DisableResourceNamingInConfig(logger.ToKlog, config.Config) - - nvmllib := nvml.New() - devicelib := device.New(nvmllib) - infolib := nvinfo.New( - nvinfo.WithNvmlLib(nvmllib), - nvinfo.WithDeviceLib(devicelib), - ) - - // Update the configuration file with default resources. - klog.Info("Updating config with default resource matching patterns.") - err = rm.AddDefaultResourcesToConfig(infolib, nvmllib, devicelib, config.Config) - if err != nil { - return nil, false, fmt.Errorf("unable to add default resources to config: %v", err) - } - - // Print the config to the output. - configJSON, err := json.MarshalIndent(config, "", " ") - if err != nil { - return nil, false, fmt.Errorf("failed to marshal config to JSON: %v", err) - } - klog.Infof("\nRunning with config:\n%v", string(configJSON)) - - // Get the set of daemons. - // Note that a daemon is only created for resources with at least one device. 
- klog.Info("Retrieving MPS daemons.") - mpsDaemons, err := mps.NewDaemons(infolib, nvmllib, devicelib, - mps.WithConfig(config), - ) - if err != nil { - return nil, false, fmt.Errorf("error getting daemons: %v", err) - } - - if len(mpsDaemons) == 0 { - klog.Info("No devices are configured for MPS sharing; Waiting indefinitely.") - } - - // Loop through all MPS daemons and start them. - // If any daemon fails to start, all daemons are started again. - for _, mpsDaemon := range mpsDaemons { - if err := mpsDaemon.Start(); err != nil { - klog.Errorf("Failed to start MPS daemon: %v", err) - return mpsDaemons, true, nil - } - } - readyFile, err := os.Create("/mps/.ready") - if err != nil { - return mpsDaemons, true, fmt.Errorf("failed to create .ready file") - } - defer readyFile.Close() - - return mpsDaemons, false, nil -} - -func stopDaemons(mpsDaemons ...*mps.Daemon) error { - if err := os.Remove("/mps/.ready"); err != nil { - klog.Warningf("Failed to remove .ready file: %v", err) - } - klog.Info("Stopping MPS daemons.") - var errs error - for _, p := range mpsDaemons { - errs = errors.Join(errs, p.Stop()) - } - return errs -} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mount/mount-shm.go b/pkg/nvidia-plugin/mps-control-daemon/mount/mount-shm.go deleted file mode 100644 index 83825e812..000000000 --- a/pkg/nvidia-plugin/mps-control-daemon/mount/mount-shm.go +++ /dev/null @@ -1,108 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -**/ - -package mount - -import ( - "bufio" - "fmt" - "os" - "os/exec" - "strconv" - "strings" - - "github.com/urfave/cli/v2" - "k8s.io/klog/v2" - "k8s.io/mount-utils" -) - -// NewCommand constructs a mount command. -func NewCommand() *cli.Command { - c := cli.Command{ - Name: "mount-shm", - Usage: "Set up the /dev/shm mount required by the MPS daemon", - Action: mountShm, - } - - return &c -} - -// mountShm creates a tmpfs mount at /mps/shm to be used by the mps control daemon. -func mountShm(c *cli.Context) error { - mountExecutable, err := exec.LookPath("mount") - if err != nil { - return fmt.Errorf("error finding 'mount' executable: %w", err) - } - mounter := mount.New(mountExecutable) - - // TODO: /mps should be configurable. - shmDir := "/mps/shm" - err = mount.CleanupMountPoint(shmDir, mounter, true) - if err != nil { - return fmt.Errorf("error unmounting %v: %w", shmDir, err) - } - - if err := os.MkdirAll(shmDir, 0755); err != nil { - return fmt.Errorf("error creating directory %v: %w", shmDir, err) - } - - sizeArg := fmt.Sprintf("size=%v", getDefaultShmSize()) - mountOptions := []string{"rw", "nosuid", "nodev", "noexec", "relatime", sizeArg} - if err := mounter.Mount("shm", shmDir, "tmpfs", mountOptions); err != nil { - return fmt.Errorf("error mounting %v as tmpfs: %w", shmDir, err) - } - - return nil -} - -// getDefaultShmSize returns the default size for the tmpfs to be created. -// This reads /proc/meminfo to get the total memory to calculate this. If this -// fails a fallback size of 65536k is used. 
-func getDefaultShmSize() string { - const fallbackSize = "65536k" - - meminfo, err := os.Open("/proc/meminfo") - if err != nil { - klog.ErrorS(err, "failed to open /proc/meminfo") - return fallbackSize - } - defer func() { - _ = meminfo.Close() - }() - - scanner := bufio.NewScanner(meminfo) - for scanner.Scan() { - line := scanner.Text() - if !strings.HasPrefix(line, "MemTotal:") { - continue - } - - parts := strings.SplitN(strings.TrimSpace(strings.TrimPrefix(line, "MemTotal:")), " ", 2) - memTotal, err := strconv.Atoi(parts[0]) - if err != nil { - klog.ErrorS(err, "could not convert MemTotal to an integer") - return fallbackSize - } - - var unit string - if len(parts) == 2 { - unit = string(parts[1][0]) - } - - return fmt.Sprintf("%d%s", memTotal/2, unit) - } - return fallbackSize -} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/daemon.go b/pkg/nvidia-plugin/mps-control-daemon/mps/daemon.go deleted file mode 100644 index 5d23c61ae..000000000 --- a/pkg/nvidia-plugin/mps-control-daemon/mps/daemon.go +++ /dev/null @@ -1,280 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-**/ - -package mps - -import ( - "bytes" - "errors" - "fmt" - "io" - "os" - "os/exec" - "path/filepath" - - "github.com/opencontainers/selinux/go-selinux" - "k8s.io/klog/v2" - - "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" -) - -type computeMode string - -const ( - mpsControlBin = "nvidia-cuda-mps-control" - - computeModeExclusiveProcess = computeMode("EXCLUSIVE_PROCESS") - computeModeDefault = computeMode("DEFAULT") - - unprivilegedContainerSELinuxLabel = "system_u:object_r:container_file_t:s0" -) - -// Daemon represents an MPS daemon. -// It is associated with a specific kubernets resource and is responsible for -// starting and stopping the deamon as well as ensuring that the memory and -// thread limits are set for the devices that the resource makes available. -type Daemon struct { - rm rm.ResourceManager - // root represents the root at which the files and folders controlled by the - // daemon are created. These include the log and pipe directories. - root Root - // logTailer tails the MPS control daemon logs. - logTailer *tailer -} - -// NewDaemon creates an MPS daemon instance. -func NewDaemon(rm rm.ResourceManager, root Root) *Daemon { - return &Daemon{ - rm: rm, - root: root, - } -} - -// Devices returns the list of devices under the control of this MPS daemon. -func (d *Daemon) Devices() rm.Devices { - return d.rm.Devices() -} - -type envvars map[string]string - -func (e envvars) toSlice() []string { - var envs []string - for k, v := range e { - envs = append(envs, k+"="+v) - } - return envs -} - -// EnvVars returns the environment variables required for the daemon. -// These should be passed to clients consuming the device shared using MPS. -// TODO: Set CUDA_VISIBLE_DEVICES to include only the devices for this resource type. -func (d *Daemon) EnvVars() envvars { - return map[string]string{ - "CUDA_MPS_PIPE_DIRECTORY": d.PipeDir(), - "CUDA_MPS_LOG_DIRECTORY": d.LogDir(), - } -} - -// Start starts the MPS deamon as a background process. 
-func (d *Daemon) Start() error { - if err := d.setComputeMode(computeModeExclusiveProcess); err != nil { - return fmt.Errorf("error setting compute mode %v: %w", computeModeExclusiveProcess, err) - } - - klog.InfoS("Staring MPS daemon", "resource", d.rm.Resource()) - - pipeDir := d.PipeDir() - if err := os.MkdirAll(pipeDir, 0755); err != nil { - return fmt.Errorf("error creating directory %v: %w", pipeDir, err) - } - - if err := setSELinuxContext(pipeDir, unprivilegedContainerSELinuxLabel); err != nil { - return fmt.Errorf("error setting SELinux context: %w", err) - } - - logDir := d.LogDir() - if err := os.MkdirAll(logDir, 0755); err != nil { - return fmt.Errorf("error creating directory %v: %w", logDir, err) - } - - mpsDaemon := exec.Command(mpsControlBin, "-d") - mpsDaemon.Env = append(mpsDaemon.Env, d.EnvVars().toSlice()...) - if err := mpsDaemon.Run(); err != nil { - return err - } - - for index, limit := range d.perDevicePinnedDeviceMemoryLimits() { - _, err := d.EchoPipeToControl(fmt.Sprintf("set_default_device_pinned_mem_limit %s %s", index, limit)) - if err != nil { - return fmt.Errorf("error setting pinned memory limit for device %v: %w", index, err) - } - } - if threadPercentage := d.activeThreadPercentage(); threadPercentage != "" { - _, err := d.EchoPipeToControl(fmt.Sprintf("set_default_active_thread_percentage %s", threadPercentage)) - if err != nil { - return fmt.Errorf("error setting active thread percentage: %w", err) - } - } - - statusFile, err := os.Create(d.startedFile()) - if err != nil { - return err - } - defer statusFile.Close() - - d.logTailer = newTailer(filepath.Join(logDir, "control.log")) - klog.InfoS("Starting log tailer", "resource", d.rm.Resource()) - if err := d.logTailer.Start(); err != nil { - klog.ErrorS(err, "Could not start tail command on control.log; ignoring logs") - } - - return nil -} - -func setSELinuxContext(path string, context string) error { - _, err := os.Stat("/sys/fs/selinux") - if err != nil && errors.Is(err, 
os.ErrNotExist) { - klog.InfoS("SELinux disabled, not updating context", "path", path) - return nil - } else if err != nil { - return fmt.Errorf("error checking if SELinux is enabled: %w", err) - } - - klog.InfoS("SELinux enabled, setting context", "path", path, "context", context) - return selinux.Chcon(path, context, true) -} - -// Stop ensures that the MPS daemon is quit. -func (d *Daemon) Stop() error { - _, err := d.EchoPipeToControl("quit") - if err != nil { - return fmt.Errorf("error sending quit message: %w", err) - } - klog.InfoS("Stopped MPS control daemon", "resource", d.rm.Resource()) - - err = d.logTailer.Stop() - klog.InfoS("Stopped log tailer", "resource", d.rm.Resource(), "error", err) - - if err := d.setComputeMode(computeModeDefault); err != nil { - return fmt.Errorf("error setting compute mode %v: %w", computeModeDefault, err) - } - - if err := os.Remove(d.startedFile()); err != nil && err != os.ErrNotExist { - return fmt.Errorf("failed to remove started file: %w", err) - } - - logDir := d.LogDir() - if err := os.RemoveAll(logDir); err != nil { - klog.ErrorS(err, "Failed to remove pipe directory", "path", logDir) - } - - return nil -} - -func (d *Daemon) LogDir() string { - return d.root.LogDir(d.rm.Resource()) -} - -func (d *Daemon) PipeDir() string { - return d.root.PipeDir(d.rm.Resource()) -} - -func (d *Daemon) ShmDir() string { - return "/dev/shm" -} - -func (d *Daemon) startedFile() string { - return d.root.startedFile(d.rm.Resource()) -} - -// AssertHealthy checks that the MPS control daemon is healthy. -func (d *Daemon) AssertHealthy() error { - _, err := d.EchoPipeToControl("get_default_active_thread_percentage") - return err -} - -// EchoPipeToControl sends the specified command to the MPS control daemon. 
-func (d *Daemon) EchoPipeToControl(command string) (string, error) { - var out bytes.Buffer - reader, writer := io.Pipe() - defer writer.Close() - defer reader.Close() - - mpsDaemon := exec.Command(mpsControlBin) - mpsDaemon.Env = append(mpsDaemon.Env, d.EnvVars().toSlice()...) - - mpsDaemon.Stdin = reader - mpsDaemon.Stdout = &out - - if err := mpsDaemon.Start(); err != nil { - return "", fmt.Errorf("failed to start NVIDIA MPS command: %w", err) - } - - if _, err := writer.Write([]byte(command)); err != nil { - return "", fmt.Errorf("failed to write message to pipe: %w", err) - } - _ = writer.Close() - - if err := mpsDaemon.Wait(); err != nil { - return "", fmt.Errorf("failed to send command to MPS daemon: %w", err) - } - return out.String(), nil -} - -func (d *Daemon) setComputeMode(mode computeMode) error { - for _, uuid := range d.Devices().GetUUIDs() { - cmd := exec.Command( - "nvidia-smi", - "-i", uuid, - "-c", string(mode)) - output, err := cmd.CombinedOutput() - if err != nil { - klog.Errorf("\n%v", string(output)) - return fmt.Errorf("error running nvidia-smi: %w", err) - } - } - return nil -} - -// perDevicePinnedMemoryLimits returns the pinned memory limits for each device. 
-func (m *Daemon) perDevicePinnedDeviceMemoryLimits() map[string]string { - totalMemoryInBytesPerDevice := make(map[string]uint64) - replicasPerDevice := make(map[string]uint64) - for _, device := range m.Devices() { - index := device.Index - totalMemoryInBytesPerDevice[index] = device.TotalMemory - replicasPerDevice[index] += 1 - } - - limits := make(map[string]string) - for index, totalMemory := range totalMemoryInBytesPerDevice { - if totalMemory == 0 { - continue - } - replicas := replicasPerDevice[index] - limits[index] = fmt.Sprintf("%vM", totalMemory/replicas/1024/1024) - } - return limits -} - -func (m *Daemon) activeThreadPercentage() string { - if len(m.Devices()) == 0 { - return "" - } - replicasPerDevice := len(m.Devices()) / len(m.Devices().GetUUIDs()) - - return fmt.Sprintf("%d", 100/replicasPerDevice) -} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/device.go b/pkg/nvidia-plugin/mps-control-daemon/mps/device.go deleted file mode 100644 index bd8b1bf3c..000000000 --- a/pkg/nvidia-plugin/mps-control-daemon/mps/device.go +++ /dev/null @@ -1,55 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -**/ - -package mps - -import ( - "errors" - "fmt" - "strings" - - "golang.org/x/mod/semver" - - "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" -) - -var errInvalidDevice = errors.New("invalid device") - -// mpsDevice represents an MPS-specific alias for an rm.Device. 
-type mpsDevice rm.Device - -// assertReplicas checks whether the number of replicas specified is valid. -func (d *mpsDevice) assertReplicas() error { - maxClients := d.maxClients() - if d.Replicas > maxClients { - return fmt.Errorf("%w maximum allowed replicas exceeded: %d > %d", errInvalidDevice, d.Replicas, maxClients) - } - return nil -} - -// maxClients returns the maximum number of clients supported by an MPS server. -func (d *mpsDevice) maxClients() int { - if d.isAtLeastVolta() { - return 48 - } - return 16 -} - -// isAtLeastVolta checks whether the specified device is a volta device or newer. -func (d *mpsDevice) isAtLeastVolta() bool { - vCc := "v" + strings.TrimPrefix(d.ComputeCapability, "v") - return semver.Compare(semver.Canonical(vCc), semver.Canonical("v7.5")) >= 0 -} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/device_test.go b/pkg/nvidia-plugin/mps-control-daemon/mps/device_test.go deleted file mode 100644 index 17cef28ea..000000000 --- a/pkg/nvidia-plugin/mps-control-daemon/mps/device_test.go +++ /dev/null @@ -1,112 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-**/ - -package mps - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestDevice(t *testing.T) { - testCases := []struct { - description string - input mpsDevice - expectedIsAtLeastVolta bool - expectedMaxClients int - expectedAssertReplicas error - }{ - { - description: "leading v ignored", - input: mpsDevice{ - ComputeCapability: "v7.5", - }, - expectedIsAtLeastVolta: true, - expectedMaxClients: 48, - }, - { - description: "no-leading v supported", - input: mpsDevice{ - ComputeCapability: "7.5", - }, - expectedIsAtLeastVolta: true, - expectedMaxClients: 48, - }, - { - description: "pre-volta clients", - input: mpsDevice{ - ComputeCapability: "7.0", - }, - expectedIsAtLeastVolta: false, - expectedMaxClients: 16, - }, - { - description: "post-volta clients", - input: mpsDevice{ - ComputeCapability: "9.0", - }, - expectedIsAtLeastVolta: true, - expectedMaxClients: 48, - }, - { - description: "pre-volta clients exceeded", - input: mpsDevice{ - ComputeCapability: "7.0", - Replicas: 29, - }, - expectedIsAtLeastVolta: false, - expectedMaxClients: 16, - expectedAssertReplicas: errInvalidDevice, - }, - { - description: "post-volta clients exceeded", - input: mpsDevice{ - ComputeCapability: "9.0", - Replicas: 49, - }, - expectedIsAtLeastVolta: true, - expectedMaxClients: 48, - expectedAssertReplicas: errInvalidDevice, - }, - { - description: "pre-volta clients max", - input: mpsDevice{ - ComputeCapability: "7.0", - Replicas: 16, - }, - expectedIsAtLeastVolta: false, - expectedMaxClients: 16, - }, - { - description: "post-volta clients max", - input: mpsDevice{ - ComputeCapability: "9.0", - Replicas: 48, - }, - expectedIsAtLeastVolta: true, - expectedMaxClients: 48, - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - require.Equal(t, tc.expectedIsAtLeastVolta, tc.input.isAtLeastVolta()) - require.Equal(t, tc.expectedMaxClients, tc.input.maxClients()) - require.ErrorIs(t, tc.input.assertReplicas(), 
tc.expectedAssertReplicas) - }) - } -} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/log-tailer.go b/pkg/nvidia-plugin/mps-control-daemon/mps/log-tailer.go deleted file mode 100644 index d9fb87b84..000000000 --- a/pkg/nvidia-plugin/mps-control-daemon/mps/log-tailer.go +++ /dev/null @@ -1,69 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -**/ - -package mps - -import ( - "context" - "os" - "os/exec" -) - -// tailer tails the contents of a file. -type tailer struct { - filename string - cmd *exec.Cmd - cancel context.CancelFunc -} - -// newTailer creates a tailer. -func newTailer(filename string) *tailer { - return &tailer{ - filename: filename, - } -} - -// Start starts tailing the specified filename. -func (t *tailer) Start() error { - ctx, cancel := context.WithCancel(context.Background()) - t.cancel = cancel - - //nolint:gosec // G204: Subprocess launched with a potential tainted input or cmd arguments (gosec) - cmd := exec.CommandContext(ctx, "tail", "-n", "+1", "-f", t.filename) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - - if err := cmd.Start(); err != nil { - return err - } - t.cmd = cmd - return nil -} - -// Stop stops the tailer. -// The associated cancel function is called after which the command wait is -// called -- if applicable. 
-func (t *tailer) Stop() error { - if t.cancel != nil { - t.cancel() - } - - if t.cmd == nil { - return nil - } - - return t.cmd.Wait() -} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/manager.go b/pkg/nvidia-plugin/mps-control-daemon/mps/manager.go deleted file mode 100644 index 719a358e6..000000000 --- a/pkg/nvidia-plugin/mps-control-daemon/mps/manager.go +++ /dev/null @@ -1,112 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -**/ - -package mps - -import ( - "fmt" - - "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" - "github.com/NVIDIA/go-nvml/pkg/nvml" - "k8s.io/klog/v2" - - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" - spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" - "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" -) - -type Manager interface { - Daemons() ([]*Daemon, error) -} - -type manager struct { - infolib info.Interface - nvmllib nvml.Interface - devicelib device.Interface - config *nvidia.DeviceConfig -} - -type nullManager struct{} - -// Daemons creates the required set of MPS daemons for the specified options. -func NewDaemons(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, opts ...Option) ([]*Daemon, error) { - manager, err := New(infolib, nvmllib, devicelib, opts...) 
- if err != nil { - return nil, fmt.Errorf("failed to create MPS manager: %w", err) - } - return manager.Daemons() -} - -// New creates a manager for MPS daemons. -// If MPS is not configured, a manager is returned that manages no daemons. -func New(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, opts ...Option) (Manager, error) { - m := &manager{ - infolib: infolib, - nvmllib: nvmllib, - devicelib: devicelib, - } - for _, opt := range opts { - opt(m) - } - - if strategy := m.config.Sharing.SharingStrategy(); strategy != spec.SharingStrategyMPS { - klog.InfoS("Sharing strategy is not MPS; skipping MPS manager creation", "strategy", strategy) - return &nullManager{}, nil - } - - return m, nil -} - -func (m *manager) Daemons() ([]*Daemon, error) { - resourceManagers, err := rm.NewNVMLResourceManagers(m.infolib, m.nvmllib, m.devicelib, m.config) - if err != nil { - return nil, err - } - var daemons []*Daemon - for _, resourceManager := range resourceManagers { - // We don't create daemons if there are no devices associated with the resource manager. - if len(resourceManager.Devices()) == 0 { - klog.InfoS("No devices associated with resource", "resource", resourceManager.Resource()) - continue - } - // Check if the resources are shared. - // TODO: We should add a more explicit check for MPS specifically - if !rm.AnnotatedIDs(resourceManager.Devices().GetIDs()).AnyHasAnnotations() { - klog.InfoS("Resource is not shared", "resource", "resource", resourceManager.Resource()) - continue - } - // Check if MIG devices are included. 
- for _, rmDevice := range resourceManager.Devices() { - if rmDevice.IsMigDevice() { - klog.Warning("MPS sharing is not supported for MIG devices; skipping daemon creation") - continue - } - if err := (*mpsDevice)(rmDevice).assertReplicas(); err != nil { - return nil, fmt.Errorf("invalid MPS configuration: %w", err) - } - } - daemon := NewDaemon(resourceManager, ContainerRoot) - daemons = append(daemons, daemon) - } - - return daemons, nil -} - -// Daemons always returns an empty slice for a nullManager. -func (m *nullManager) Daemons() ([]*Daemon, error) { - return nil, nil -} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/options.go b/pkg/nvidia-plugin/mps-control-daemon/mps/options.go deleted file mode 100644 index ca97d122f..000000000 --- a/pkg/nvidia-plugin/mps-control-daemon/mps/options.go +++ /dev/null @@ -1,29 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -**/ - -package mps - -import "github.com/Project-HAMi/HAMi/pkg/device/nvidia" - -// Option defines a functional option for configuring an MPS manager. -type Option func(*manager) - -// WithConfig sets the config associated with the MPS manager. 
-func WithConfig(config *nvidia.DeviceConfig) Option { - return func(m *manager) { - m.config = config - } -} diff --git a/pkg/nvidia-plugin/mps-control-daemon/mps/root.go b/pkg/nvidia-plugin/mps-control-daemon/mps/root.go deleted file mode 100644 index 9c2e105f8..000000000 --- a/pkg/nvidia-plugin/mps-control-daemon/mps/root.go +++ /dev/null @@ -1,59 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -**/ - -package mps - -import ( - "path/filepath" - - spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" -) - -const ( - ContainerRoot = Root("/mps") -) - -// Root represents an MPS root. -// This is where per-resource pipe and log directories are created. -// For containerised applications the host root is typically mounted to /mps in the container. -type Root string - -// LogDir returns the per-resource pipe dir for the specified root. -func (r Root) LogDir(resourceName spec.ResourceName) string { - return r.Path(string(resourceName), "log") -} - -// PipeDir returns the per-resource pipe dir for the specified root. -func (r Root) PipeDir(resourceName spec.ResourceName) string { - return r.Path(string(resourceName), "pipe") -} - -// ShmDir returns the shm dir associated with the root. -// Note that the shm dir is the same for all resources. 
-func (r Root) ShmDir(resourceName spec.ResourceName) string { - return r.Path("shm") -} - -// startedFile returns the per-resource .started file name for the specified root. -func (r Root) startedFile(resourceName spec.ResourceName) string { - return r.Path(string(resourceName), ".started") -} - -// Path returns a path relative to the MPS root. -func (r Root) Path(parts ...string) string { - pathparts := append([]string{string(r)}, parts...) - return filepath.Join(pathparts...) -} diff --git a/pkg/nvidia-plugin/pkg/plugin/mps.go b/pkg/nvidia-plugin/pkg/plugin/mps.go deleted file mode 100644 index c4b304f07..000000000 --- a/pkg/nvidia-plugin/pkg/plugin/mps.go +++ /dev/null @@ -1,91 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -**/ - -package plugin - -import ( - "errors" - "fmt" - - "k8s.io/klog/v2" - pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" - - spec "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/api/config/v1" - "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/mps-control-daemon/mps" - "github.com/Project-HAMi/HAMi/pkg/nvidia-plugin/pkg/rm" -) - -type mpsOptions struct { - enabled bool - resourceName spec.ResourceName - daemon *mps.Daemon - hostRoot mps.Root -} - -// getMPSOptions returns the MPS options specified for the resource manager. -// If MPS is not configured and empty set of options is returned. 
-func (o *options) getMPSOptions(resourceManager rm.ResourceManager) (mpsOptions, error) { - if o.config.Sharing.SharingStrategy() != spec.SharingStrategyMPS { - return mpsOptions{}, nil - } - - // TODO: It might make sense to pull this logic into a resource manager. - for _, device := range resourceManager.Devices() { - if device.IsMigDevice() { - return mpsOptions{}, errors.New("sharing using MPS is not supported for MIG devices") - } - } - - m := mpsOptions{ - enabled: true, - resourceName: resourceManager.Resource(), - daemon: mps.NewDaemon(resourceManager, mps.ContainerRoot), - hostRoot: mps.Root(*o.config.Flags.CommandLineFlags.MpsRoot), - } - return m, nil -} - -func (m *mpsOptions) waitForDaemon() error { - if m == nil || !m.enabled { - return nil - } - // TODO: Check the .ready file here. - // TODO: Have some retry strategy here. - if err := m.daemon.AssertHealthy(); err != nil { - return fmt.Errorf("error checking MPS daemon health: %w", err) - } - klog.InfoS("MPS daemon is healthy", "resource", m.resourceName) - return nil -} - -func (m *mpsOptions) updateReponse(response *pluginapi.ContainerAllocateResponse) { - if m == nil || !m.enabled { - return - } - // TODO: We should check that the deviceIDs are shared using MPS. 
- response.Envs["CUDA_MPS_PIPE_DIRECTORY"] = m.daemon.PipeDir() - - response.Mounts = append(response.Mounts, - &pluginapi.Mount{ - ContainerPath: m.daemon.PipeDir(), - HostPath: m.hostRoot.PipeDir(m.resourceName), - }, - &pluginapi.Mount{ - ContainerPath: m.daemon.ShmDir(), - HostPath: m.hostRoot.ShmDir(m.resourceName), - }, - ) -} diff --git a/pkg/nvidia-plugin/pkg/plugin/server.go b/pkg/nvidia-plugin/pkg/plugin/server.go index af8c3843f..8f562875d 100644 --- a/pkg/nvidia-plugin/pkg/plugin/server.go +++ b/pkg/nvidia-plugin/pkg/plugin/server.go @@ -80,8 +80,6 @@ type NvidiaDevicePlugin struct { imexChannels imex.Channels - mps mpsOptions - operatingMode string migCurrent nvidia.MigPartedSpec schedulerConfig nvidia.NvidiaConfig @@ -89,10 +87,6 @@ type NvidiaDevicePlugin struct { // devicePluginForResource creates a device plugin for the specified resource. func (o *options) devicePluginForResource(resourceManager rm.ResourceManager) (Interface, error) { - mpsOptions, err := o.getMPSOptions(resourceManager) - if err != nil { - return nil, err - } sConfig, mode, err := LoadNvidiaDevicePluginConfig() if err != nil { return nil, fmt.Errorf("failed to load nvidia plugin config: %v", err) @@ -113,8 +107,6 @@ func (o *options) devicePluginForResource(resourceManager rm.ResourceManager) (I imexChannels: o.imexChannels, - mps: mpsOptions, - socket: getPluginSocketPath(resourceManager.Resource()), // These will be reinitialized every // time the plugin server is restarted. 
@@ -208,10 +200,6 @@ func (plugin *NvidiaDevicePlugin) Devices() rm.Devices { func (plugin *NvidiaDevicePlugin) Start(kubeletSocket string) error { plugin.initialize() - if err := plugin.mps.waitForDaemon(); err != nil { - return fmt.Errorf("error waiting for MPS daemon: %w", err) - } - err := plugin.Serve() if err != nil { klog.Errorf("Could not start device plugin for '%s': %s", plugin.rm.Resource(), err) @@ -251,7 +239,6 @@ func (plugin *NvidiaDevicePlugin) Start(kubeletSocket string) error { } go func() { - // TODO: add MPS health check err := plugin.rm.CheckHealth(plugin.stop, plugin.health) if err != nil { klog.Errorf("Failed to start health check: %v; continuing with health checks disabled", err) @@ -410,14 +397,16 @@ func (plugin *NvidiaDevicePlugin) GetPreferredAllocation(ctx context.Context, r // Allocate which return list of devices. func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { - responses := pluginapi.AllocateResponse{} + klog.InfoS("Allocate", "request", reqs) + responses := pluginapi.AllocateResponse{} nodeName := os.Getenv(util.NodeNameEnvName) + klog.Infof("Allocate request on node %s", nodeName) current, err := util.GetPendingPod(ctx, nodeName) if err != nil { return &responses, err } - + klog.Infof("Allocate pod name is %s/%s, annotation is %+v", current.Namespace, current.Name, current.Annotations) for idx, req := range reqs.ContainerRequests { if err := plugin.rm.ValidateRequest(req.DevicesIDs); err != nil { return nil, fmt.Errorf("invalid allocation request for %q: %w", plugin.rm.Resource(), err) @@ -509,7 +498,8 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi. 
} responses.ContainerResponses = append(responses.ContainerResponses, response) } - + klog.Infof("Final allocate response: %v", responses) + device.PodAllocationTrySuccess(nodeName, nvidia.NvidiaGPUDevice, NodeLockNvidia, current) return &responses, nil } @@ -526,23 +516,19 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu return nil, fmt.Errorf("failed to get allocate response for CDI: %v", err) } } - if plugin.mps.enabled { - plugin.updateResponseForMPS(response) - } - // The following modifications are only made if at least one non-CDI device // list strategy is selected. if plugin.deviceListStrategies.AllCDIEnabled() { return response, nil } - if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyEnvVar) { - plugin.updateResponseForDeviceListEnvVar(response, deviceIDs...) - plugin.updateResponseForImexChannelsEnvVar(response) - } - if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyVolumeMounts) { - plugin.updateResponseForDeviceMounts(response, deviceIDs...) - } + // if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyEnvVar) { + // plugin.updateResponseForDeviceListEnvVar(response, deviceIDs...) + // plugin.updateResponseForImexChannelsEnvVar(response) + // } + // if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyVolumeMounts) { + // plugin.updateResponseForDeviceMounts(response, deviceIDs...) + // } if *plugin.config.Flags.Plugin.PassDeviceSpecs { response.Devices = append(response.Devices, plugin.apiDeviceSpecs(*plugin.config.Flags.NvidiaDevRoot, requestIds)...) } @@ -555,13 +541,6 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu return response, nil } -// updateResponseForMPS ensures that the ContainerAllocate response contains the information required to use MPS. -// This includes per-resource pipe and log directories as well as a global daemon-specific shm -// and assumes that an MPS control daemon has already been started. 
-func (plugin NvidiaDevicePlugin) updateResponseForMPS(response *pluginapi.ContainerAllocateResponse) { - plugin.mps.updateReponse(response) -} - // updateResponseForCDI updates the specified response for the given device IDs. // This response contains the annotations required to trigger CDI injection in the container engine or nvidia-container-runtime. func (plugin *NvidiaDevicePlugin) updateResponseForCDI(response *pluginapi.ContainerAllocateResponse, responseID string, deviceIDs ...string) error { diff --git a/pkg/nvidia-plugin/pkg/rm/rm.go b/pkg/nvidia-plugin/pkg/rm/rm.go index 5267b60b1..4747a546d 100644 --- a/pkg/nvidia-plugin/pkg/rm/rm.go +++ b/pkg/nvidia-plugin/pkg/rm/rm.go @@ -95,11 +95,15 @@ func (r *resourceManager) ValidateRequest(ids AnnotatedIDs) error { } // AddDefaultResourcesToConfig adds default resource matching rules to config.Resources -func AddDefaultResourcesToConfig(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, config *spec.Config) error { - _ = config.Resources.AddGPUResource("*", "gpu") - if config.Flags.MigStrategy == nil { - return nil - } +func AddDefaultResourcesToConfig(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, config *nvidia.DeviceConfig) error { + // _ = config.Resources.AddGPUResource("*", "gpu") + // if config.Flags.MigStrategy == nil { + // return nil + // } + config.Resources.GPUs = append(config.Resources.GPUs, spec.Resource{ + Pattern: "*", + Name: spec.ResourceName(*config.ResourceName), + }) switch *config.Flags.MigStrategy { case spec.MigStrategySingle: return config.Resources.AddMIGResource("*", "gpu") From 0f895a109a28b8495c081b3ebf388a0b86032c25 Mon Sep 17 00:00:00 2001 From: haitwang-cloud Date: Wed, 2 Apr 2025 18:02:07 +0800 Subject: [PATCH 4/5] refactor: remove unnecessary logging statements and improve device processing logs --- pkg/device/devices.go | 23 ++++- pkg/nvidia-plugin/pkg/plugin/register.go | 7 +- pkg/nvidia-plugin/pkg/plugin/server.go | 105 
+++++++++++++++-------- pkg/nvidia-plugin/pkg/plugin/util.go | 1 - pkg/nvidia-plugin/pkg/rm/devices.go | 18 +++- pkg/util/util.go | 2 - 6 files changed, 108 insertions(+), 48 deletions(-) diff --git a/pkg/device/devices.go b/pkg/device/devices.go index 019bd5d63..a40cb72b3 100644 --- a/pkg/device/devices.go +++ b/pkg/device/devices.go @@ -319,19 +319,34 @@ vnpus: } func PodAllocationTrySuccess(nodeName string, devName string, lockName string, pod *corev1.Pod) { + // 日志:开始尝试分配成功逻辑 + klog.InfoS("Starting PodAllocationTrySuccess", "nodeName", nodeName, "deviceName", devName, "lockName", lockName, "namespace", pod.Namespace, "podName", pod.Name) + + // 获取最新的 Pod 信息 refreshed, err := client.GetClient().CoreV1().Pods(pod.Namespace).Get(context.Background(), pod.Name, metav1.GetOptions{}) if err != nil { - klog.Errorf("Error getting pod %s/%s: %v", pod.Namespace, pod.Name, err) + klog.ErrorS(err, "Failed to get refreshed pod", "nodeName", nodeName, "namespace", pod.Namespace, "podName", pod.Name) + return + } + + // 获取设备相关的注释信息 + annos, exists := refreshed.Annotations[util.InRequestDevices[devName]] + if !exists { + klog.Warningf("Annotation for device %s not found in pod %s/%s", devName, pod.Namespace, pod.Name) return } - annos := refreshed.Annotations[util.InRequestDevices[devName]] - klog.Infof("Trying allocation success: %s", annos) + klog.InfoS("Processing allocation success", "annotations", annos, "namespace", pod.Namespace, "podName", pod.Name) + + // 检查是否还有未处理的设备 for _, val := range DevicesToHandle { if strings.Contains(annos, val) { + klog.Infof("Device %s still pending allocation for pod %s/%s", val, pod.Namespace, pod.Name) return } } - klog.Infof("All devices allocate success, releasing lock") + + // 如果所有设备都已成功分配,释放锁 + klog.InfoS("All devices allocated successfully, releasing lock", "namespace", pod.Namespace, "podName", pod.Name) PodAllocationSuccess(nodeName, pod, lockName) } diff --git a/pkg/nvidia-plugin/pkg/plugin/register.go 
b/pkg/nvidia-plugin/pkg/plugin/register.go index c78141ea9..10ce458f0 100644 --- a/pkg/nvidia-plugin/pkg/plugin/register.go +++ b/pkg/nvidia-plugin/pkg/plugin/register.go @@ -141,7 +141,6 @@ func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo { if plugin.schedulerConfig.DeviceMemoryScaling != 1 { registeredmem = int32(float64(registeredmem) * plugin.schedulerConfig.DeviceMemoryScaling) } - klog.Infoln("MemoryScaling=", plugin.schedulerConfig.DeviceMemoryScaling, "registeredmem=", registeredmem) health := true for _, val := range devs { if strings.Compare(val.ID, UUID) == 0 { @@ -170,7 +169,6 @@ func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo { Mode: plugin.operatingMode, Health: health, }) - klog.Infof("nvml registered device id=%v, memory=%v, type=%v, numa=%v", idx, registeredmem, Model, numa) } return &res } @@ -187,7 +185,6 @@ func (plugin *NvidiaDevicePlugin) RegistrInAnnotation() error { encodeddevices := util.EncodeNodeDevices(*devices) annos[nvidia.HandshakeAnnos] = "Reported " + time.Now().String() annos[nvidia.RegisterAnnos] = encodeddevices - klog.Infof("patch node with the following annos %v", fmt.Sprintf("%v", annos)) err = util.PatchNodeAnnotations(node, annos) if err != nil { @@ -203,11 +200,9 @@ func (plugin *NvidiaDevicePlugin) WatchAndRegister() { for { err := plugin.RegistrInAnnotation() if err != nil { - klog.Errorf("Failed to register annotation: %v", err) - klog.Infof("Retrying in %v seconds...", errorSleepInterval) + klog.Errorf("Failed to register annotation: %v. Retrying in %v...", err, errorSleepInterval) time.Sleep(errorSleepInterval) } else { - klog.Infof("Successfully registered annotation. 
Next check in %v seconds...", successSleepInterval) time.Sleep(successSleepInterval) } } diff --git a/pkg/nvidia-plugin/pkg/plugin/server.go b/pkg/nvidia-plugin/pkg/plugin/server.go index 8f562875d..94b3cfd14 100644 --- a/pkg/nvidia-plugin/pkg/plugin/server.go +++ b/pkg/nvidia-plugin/pkg/plugin/server.go @@ -337,7 +337,7 @@ func (plugin *NvidiaDevicePlugin) Register(kubeletSocket string) error { Endpoint: path.Base(plugin.socket), ResourceName: string(plugin.rm.Resource()), Options: &pluginapi.DevicePluginOptions{ - GetPreferredAllocationAvailable: true, + GetPreferredAllocationAvailable: false, }, } @@ -351,7 +351,7 @@ func (plugin *NvidiaDevicePlugin) Register(kubeletSocket string) error { // GetDevicePluginOptions returns the values of the optional settings for this plugin func (plugin *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { options := &pluginapi.DevicePluginOptions{ - GetPreferredAllocationAvailable: true, + GetPreferredAllocationAvailable: false, } return options, nil } @@ -397,42 +397,60 @@ func (plugin *NvidiaDevicePlugin) GetPreferredAllocation(ctx context.Context, r // Allocate which return list of devices. 
func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { + // 日志:函数开始 + klog.InfoS("Allocate function started", "request", reqs) - klog.InfoS("Allocate", "request", reqs) responses := pluginapi.AllocateResponse{} nodeName := os.Getenv(util.NodeNameEnvName) - klog.Infof("Allocate request on node %s", nodeName) + klog.InfoS("Processing allocate request on node", "nodeName", nodeName) + + // 获取当前待处理的 Pod 信息 current, err := util.GetPendingPod(ctx, nodeName) if err != nil { + klog.ErrorS(err, "Failed to get pending pod", "nodeName", nodeName) return &responses, err } - klog.Infof("Allocate pod name is %s/%s, annotation is %+v", current.Namespace, current.Name, current.Annotations) - for idx, req := range reqs.ContainerRequests { - if err := plugin.rm.ValidateRequest(req.DevicesIDs); err != nil { - return nil, fmt.Errorf("invalid allocation request for %q: %w", plugin.rm.Resource(), err) - } + klog.InfoS("Processing allocate request for pod", "namespace", current.Namespace, "name", current.Name, "annotations", current.Annotations) + + for idx, _ := range reqs.ContainerRequests { + containerIndex := idx + 1 + klog.InfoS("Processing container request", "containerIndex", containerIndex, "totalContainers", len(reqs.ContainerRequests), "namespace", current.Namespace, "podName", current.Name) + + //if err := plugin.rm.ValidateRequest(req.DevicesIDs); err != nil { + // klog.ErrorS(err, "Invalid allocation request", "resource", plugin.rm.Resource(), "devicesIDs", req.DevicesIDs, "namespace", current.Namespace, "podName", current.Name) + // return nil, fmt.Errorf("invalid allocation request for %q: %w", plugin.rm.Resource(), err) + //} + currentCtr, devreq, err := GetNextDeviceRequest(nvidia.NvidiaGPUDevice, *current) - klog.Infoln("deviceAllocateFromAnnotation=", devreq) if err != nil { + klog.ErrorS(err, "Failed to get next device request", "nodeName", nodeName, "namespace", current.Namespace, 
"podName", current.Name) device.PodAllocationFailed(nodeName, current, NodeLockNvidia) return &responses, err } + if len(devreq) != len(reqs.ContainerRequests[idx].DevicesIDs) { + err := errors.New("device number not matched") + klog.ErrorS(err, "Device number mismatch", "expected", len(reqs.ContainerRequests[idx].DevicesIDs), "got", len(devreq), "namespace", current.Namespace, "podName", current.Name) device.PodAllocationFailed(nodeName, current, NodeLockNvidia) - return &responses, errors.New("device number not matched") + return &responses, err } + response, err := plugin.getAllocateResponse(plugin.GetContainerDeviceStrArray(devreq)) if err != nil { + klog.ErrorS(err, "Failed to get allocate response", "namespace", current.Namespace, "podName", current.Name) return nil, fmt.Errorf("failed to get allocate response: %v", err) } - err = EraseNextDeviceTypeFromAnnotation(nvidia.NvidiaGPUDevice, *current) - if err != nil { + if err := EraseNextDeviceTypeFromAnnotation(nvidia.NvidiaGPUDevice, *current); err != nil { + klog.ErrorS(err, "Failed to erase next device type from annotation", "namespace", current.Namespace, "podName", current.Name) device.PodAllocationFailed(nodeName, current, NodeLockNvidia) return &responses, err } if plugin.operatingMode != "mig" { + klog.InfoS("Starting to allocate devices for pod", "namespace", current.Namespace, "podName", current.Name) + + response.Envs["NVIDIA_VISIBLE_DEVICES"] = strings.Join(reqs.ContainerRequests[idx].DevicesIDs, ",") for i, dev := range devreq { limitKey := fmt.Sprintf("CUDA_DEVICE_MEMORY_LIMIT_%v", i) response.Envs[limitKey] = fmt.Sprintf("%vm", dev.Usedmem) @@ -445,43 +463,53 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi. 
if plugin.schedulerConfig.DisableCoreLimit { response.Envs[util.CoreLimitSwitch] = "disable" } + cacheFileHostDirectory := fmt.Sprintf("%s/vgpu/containers/%s_%s", hostHookPath, current.UID, currentCtr.Name) os.RemoveAll(cacheFileHostDirectory) - + klog.InfoS("Creating cache file host directory for pod", "namespace", current.Namespace, "podName", current.Name) os.MkdirAll(cacheFileHostDirectory, 0777) os.Chmod(cacheFileHostDirectory, 0777) os.MkdirAll("/tmp/vgpulock", 0777) os.Chmod("/tmp/vgpulock", 0777) + response.Mounts = append(response.Mounts, - &pluginapi.Mount{ContainerPath: fmt.Sprintf("%s/vgpu/libvgpu.so", hostHookPath), - HostPath: GetLibPath(), - ReadOnly: true}, - &pluginapi.Mount{ContainerPath: fmt.Sprintf("%s/vgpu", hostHookPath), - HostPath: cacheFileHostDirectory, - ReadOnly: false}, - &pluginapi.Mount{ContainerPath: "/tmp/vgpulock", - HostPath: "/tmp/vgpulock", - ReadOnly: false}, + &pluginapi.Mount{ + ContainerPath: fmt.Sprintf("%s/vgpu/libvgpu.so", hostHookPath), + HostPath: GetLibPath(), + ReadOnly: true, + }, + &pluginapi.Mount{ + ContainerPath: fmt.Sprintf("%s/vgpu", hostHookPath), + HostPath: cacheFileHostDirectory, + ReadOnly: false, + }, + &pluginapi.Mount{ + ContainerPath: "/tmp/vgpulock", + HostPath: "/tmp/vgpulock", + ReadOnly: false, + }, ) + + // 检查 CUDA_DISABLE_CONTROL 环境变量是否存在 found := false for _, val := range currentCtr.Env { if strings.Compare(val.Name, "CUDA_DISABLE_CONTROL") == 0 { - // if env existed but is set to false or can not be parsed, ignore t, _ := strconv.ParseBool(val.Value) - if !t { - continue + if t { + found = true + break } - // only env existed and set to true, we mark it "found" - found = true - break } } if !found { - response.Mounts = append(response.Mounts, &pluginapi.Mount{ContainerPath: "/etc/ld.so.preload", - HostPath: hostHookPath + "/vgpu/ld.so.preload", - ReadOnly: true}, - ) + response.Mounts = append(response.Mounts, &pluginapi.Mount{ + ContainerPath: "/etc/ld.so.preload", + HostPath: hostHookPath + 
"/vgpu/ld.so.preload", + ReadOnly: true, + }) } + + // 检查许可证文件是否存在 _, err = os.Stat(fmt.Sprintf("%s/vgpu/license", hostHookPath)) if err == nil { response.Mounts = append(response.Mounts, &pluginapi.Mount{ @@ -496,10 +524,19 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi. }) } } + + // 将容器响应追加到最终响应中 + klog.InfoS("Appending container response", "containerIndex", containerIndex, "totalContainers", len(reqs.ContainerRequests), "namespace", current.Namespace, "podName", current.Name) responses.ContainerResponses = append(responses.ContainerResponses, response) } - klog.Infof("Final allocate response: %v", responses) + + // 日志:最终分配响应 + klog.InfoS("Final allocate response generated", "response", responses) + + // 标记 Pod 分配成功 device.PodAllocationTrySuccess(nodeName, nvidia.NvidiaGPUDevice, NodeLockNvidia, current) + klog.InfoS("Allocate function completed successfully", "response", responses) + return &responses, nil } diff --git a/pkg/nvidia-plugin/pkg/plugin/util.go b/pkg/nvidia-plugin/pkg/plugin/util.go index 2b16900b3..3e2e248a9 100644 --- a/pkg/nvidia-plugin/pkg/plugin/util.go +++ b/pkg/nvidia-plugin/pkg/plugin/util.go @@ -85,7 +85,6 @@ func EraseNextDeviceTypeFromAnnotation(dtype string, p corev1.Pod) error { } } } - klog.Infoln("After erase res=", res) newannos := make(map[string]string) newannos[util.InRequestDevices[dtype]] = util.EncodePodSingleDevice(res) return util.PatchPodAnnotations(&p, newannos) diff --git a/pkg/nvidia-plugin/pkg/rm/devices.go b/pkg/nvidia-plugin/pkg/rm/devices.go index 150c9a19b..ac1af5cb7 100644 --- a/pkg/nvidia-plugin/pkg/rm/devices.go +++ b/pkg/nvidia-plugin/pkg/rm/devices.go @@ -21,6 +21,7 @@ import ( "strconv" "strings" + "k8s.io/klog/v2" pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) @@ -177,10 +178,21 @@ func (ds Devices) GetUUIDs() []string { // GetPluginDevices returns the plugin Devices from all devices in the Devices func (ds Devices) GetPluginDevices(count uint) 
[]*pluginapi.Device { var res []*pluginapi.Device - if !strings.Contains(ds.GetIDs()[0], "MIG") { + + // Log the IDs of all devices + ids := ds.GetIDs() + if len(ids) == 0 { + klog.Warning("No devices found in Devices map") + return res + } + klog.InfoS("Processing devices", "deviceIDs", ids, "count", count) + + if !strings.Contains(ids[0], "MIG") { + klog.Info("Devices are not MIG-enabled, generating plugin devices with replicas") for _, dev := range ds { for i := uint(0); i < count; i++ { id := fmt.Sprintf("%v-%v", dev.ID, i) + klog.InfoS("Adding device", "deviceID", id, "health", dev.Health) res = append(res, &pluginapi.Device{ ID: id, Health: dev.Health, @@ -189,11 +201,15 @@ func (ds Devices) GetPluginDevices(count uint) []*pluginapi.Device { } } } else { + klog.Info("Devices are MIG-enabled, adding directly") for _, device := range ds { d := device + klog.InfoS("Adding MIG device", "deviceID", d.ID, "health", d.Health) res = append(res, &d.Device) } } + + klog.InfoS("Finished processing devices", "totalDevices", len(res)) return res } diff --git a/pkg/util/util.go b/pkg/util/util.go index 4a4bb2dba..fbd3e69c6 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -231,7 +231,6 @@ func EncodeContainerDevices(cd ContainerDevices) string { for _, val := range cd { tmp += val.UUID + "," + val.Type + "," + strconv.Itoa(int(val.Usedmem)) + "," + strconv.Itoa(int(val.Usedcores)) + OneContainerMultiDeviceSplitSymbol } - klog.Infof("Encoded container Devices: %s", tmp) return tmp //return strings.Join(cd, ",") } @@ -254,7 +253,6 @@ func EncodePodSingleDevice(pd PodSingleDevice) string { res = res + EncodeContainerDevices(ctrdevs) res = res + OnePodMultiContainerSplitSymbol } - klog.Infof("Encoded pod single devices %s", res) return res } From feaac9971ed76f37f44405cecfe420bf6e6f8750 Mon Sep 17 00:00:00 2001 From: haitwang-cloud Date: Wed, 9 Apr 2025 14:09:14 +0800 Subject: [PATCH 5/5] feat: add error logging and improve device information collection in NVIDIA plugin 
--- logs.txt | 1 + pkg/nvidia-plugin/pkg/cdi/cdi.go | 10 +- pkg/nvidia-plugin/pkg/plugin/register.go | 132 ++++++++++++++++++----- pkg/nvidia-plugin/pkg/plugin/server.go | 39 ++++--- 4 files changed, 141 insertions(+), 41 deletions(-) create mode 100644 logs.txt diff --git a/logs.txt b/logs.txt new file mode 100644 index 000000000..f072e0b0c --- /dev/null +++ b/logs.txt @@ -0,0 +1 @@ +Error: failed to create containerd task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: error running hook #0: error running hook: exit status 1, stdout: , stderr: nvidia-container-cli.real: device error: GPU-3c955d4d-eea8-345a-5477-85f2c1d1ae7b-4: unknown device: unknown \ No newline at end of file diff --git a/pkg/nvidia-plugin/pkg/cdi/cdi.go b/pkg/nvidia-plugin/pkg/cdi/cdi.go index b3227d437..789ebacf2 100644 --- a/pkg/nvidia-plugin/pkg/cdi/cdi.go +++ b/pkg/nvidia-plugin/pkg/cdi/cdi.go @@ -227,5 +227,13 @@ func (cdi *cdiHandler) getRootTransformer() transform.Transformer { // QualifiedName constructs a CDI qualified device name for the specified resources. // Note: This assumes that the specified id matches the device name returned by the naming strategy. 
func (cdi *cdiHandler) QualifiedName(class string, id string) string { - return cdiparser.QualifiedName(cdi.vendor, class, id) + if id == "" { + klog.Error("Empty device ID received") + return "" + } + + name := cdiparser.QualifiedName(cdi.vendor, class, id) + klog.Infof("CDI name generated - Vendor: %s, Class: %s, ID: %s -> %s", + cdi.vendor, class, id, name) + return name } diff --git a/pkg/nvidia-plugin/pkg/plugin/register.go b/pkg/nvidia-plugin/pkg/plugin/register.go index 10ce458f0..cd59a6a24 100644 --- a/pkg/nvidia-plugin/pkg/plugin/register.go +++ b/pkg/nvidia-plugin/pkg/plugin/register.go @@ -108,56 +108,106 @@ func parseNvidiaNumaInfo(idx int, nvidiaTopoStr string) (int, error) { } func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo { + // Initialize NVML and get device list devs := plugin.Devices() - klog.V(5).InfoS("getAPIDevices", "devices", devs) - nvml.Init() + klog.InfoS("Starting to collect GPU device information", "deviceCount", len(devs)) + + // Initialize NVML library + if ret := nvml.Init(); ret != nvml.SUCCESS { + errMsg := nvml.ErrorString(ret) + klog.ErrorS(fmt.Errorf(errMsg), "Failed to initialize NVML") + return &[]*util.DeviceInfo{} + } + defer nvml.Shutdown() + res := make([]*util.DeviceInfo, 0, len(devs)) + var errorCount int + + // Process each GPU device for UUID := range devs { + // Get device handle by UUID ndev, ret := nvml.DeviceGetHandleByUUID(UUID) if ret != nvml.SUCCESS { - klog.Errorln("nvml new device by index error uuid=", UUID, "err=", ret) - panic(0) + errMsg := nvml.ErrorString(ret) + klog.ErrorS(fmt.Errorf(errMsg), "Failed to get device handle", + "uuid", UUID, "errorCode", ret) + errorCount++ + continue } + + // Get device index idx, ret := ndev.GetIndex() if ret != nvml.SUCCESS { - klog.Errorln("nvml get index error ret=", ret) - panic(0) + errMsg := nvml.ErrorString(ret) + klog.ErrorS(fmt.Errorf(errMsg), "Failed to get device index", + "uuid", UUID, "errorCode", ret) + errorCount++ + continue } - 
memoryTotal := 0 + + // Get memory information memory, ret := ndev.GetMemoryInfo() - if ret == nvml.SUCCESS { - memoryTotal = int(memory.Total) - } else { - klog.Error("nvml get memory error ret=", ret) - panic(0) - } - Model, ret := ndev.GetName() if ret != nvml.SUCCESS { - klog.Error("nvml get name error ret=", ret) - panic(0) + errMsg := nvml.ErrorString(ret) + klog.ErrorS(fmt.Errorf(errMsg), "Failed to get memory info", + "uuid", UUID, "index", idx) + errorCount++ + continue } + memoryTotal := int(memory.Total) + // Calculate registered memory with scaling factor registeredmem := int32(memoryTotal / 1024 / 1024) if plugin.schedulerConfig.DeviceMemoryScaling != 1 { + original := registeredmem registeredmem = int32(float64(registeredmem) * plugin.schedulerConfig.DeviceMemoryScaling) + klog.V(4).InfoS("Applied memory scaling", + "originalMB", original, + "scaledMB", registeredmem, + "scalingFactor", plugin.schedulerConfig.DeviceMemoryScaling) + } + + // Get device model name + Model, ret := ndev.GetName() + if ret != nvml.SUCCESS { + errMsg := nvml.ErrorString(ret) + klog.ErrorS(fmt.Errorf(errMsg), "Failed to get device name", + "uuid", UUID, "index", idx) + errorCount++ + continue } + + // Check device health status health := true for _, val := range devs { if strings.Compare(val.ID, UUID) == 0 { - // when NVIDIA-Tesla P4, the device info is : ID:GPU-e290caca-2f0c-9582-acab-67a142b61ffa,Health:Healthy,Topology:nil, - // it is more reasonable to think of healthy as case-insensitive - if strings.EqualFold(val.Health, "healthy") { - health = true - } else { - health = false + health = strings.EqualFold(val.Health, "healthy") + if !health { + klog.Warning("Device is not healthy", + "uuid", UUID, "index", idx, + "healthStatus", val.Health) } break } } + + // Get NUMA affinity information numa, err := plugin.getNumaInformation(idx) if err != nil { - klog.ErrorS(err, "failed to get numa information", "idx", idx) + klog.ErrorS(err, "Failed to get NUMA information", + 
"uuid", UUID, "index", idx) } + + // Log successful device collection + klog.InfoS("Successfully collected GPU device info", + "uuid", UUID, + "index", idx, + "model", Model, + "memoryMB", registeredmem, + "numaNode", numa, + "healthStatus", health) + + // Add device info to result res = append(res, &util.DeviceInfo{ ID: UUID, Index: uint(idx), @@ -170,26 +220,54 @@ func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo { Health: health, }) } + + // Log summary of device collection + if errorCount > 0 { + klog.Warning("Failed to collect some GPU device information", + "errorCount", errorCount, + "totalDevices", len(devs), + "successfulDevices", len(res)) + } else { + klog.InfoS("Successfully collected all GPU device information", + "deviceCount", len(res)) + } + return &res } func (plugin *NvidiaDevicePlugin) RegistrInAnnotation() error { devices := plugin.getAPIDevices() - klog.InfoS("start working on the devices", "devices", devices) + klog.Infof("Starting to register %d devices in node annotation", len(*devices)) + + if len(*devices) == 0 { + klog.Warning("No GPU devices found to register") + return nil + } + for i, dev := range *devices { + klog.InfoS("Device details", + "index", i, + "uuid", dev.ID, + "type", dev.Type, + "memoryMB", dev.Devmem, + "numaNode", dev.Numa, + "health", dev.Health) + } annos := make(map[string]string) node, err := util.GetNode(util.NodeName) if err != nil { klog.Errorln("get node error", err.Error()) return err } - encodeddevices := util.EncodeNodeDevices(*devices) + encodedDevices := util.EncodeNodeDevices(*devices) annos[nvidia.HandshakeAnnos] = "Reported " + time.Now().String() - annos[nvidia.RegisterAnnos] = encodeddevices + annos[nvidia.RegisterAnnos] = encodedDevices err = util.PatchNodeAnnotations(node, annos) - if err != nil { klog.Errorln("patch node error", err.Error()) } + klog.InfoS("Successfully registered devices in node annotation", + "deviceCount", len(*devices), + "nodeName", util.NodeName) return err 
} diff --git a/pkg/nvidia-plugin/pkg/plugin/server.go b/pkg/nvidia-plugin/pkg/plugin/server.go index 94b3cfd14..5e7acc334 100644 --- a/pkg/nvidia-plugin/pkg/plugin/server.go +++ b/pkg/nvidia-plugin/pkg/plugin/server.go @@ -358,7 +358,13 @@ func (plugin *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *plugi // ListAndWatch lists devices and update that list according to the health status func (plugin *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { - if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil { + devices := plugin.apiDevices() + klog.Infof("Listing devices for resource %s, count: %d", plugin.rm.Resource(), len(devices)) + + for _, dev := range devices { + klog.V(5).Infof("Device ID: %s, Health: %s", dev.ID, dev.Health) + } + if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: devices}); err != nil { return err } @@ -397,14 +403,12 @@ func (plugin *NvidiaDevicePlugin) GetPreferredAllocation(ctx context.Context, r // Allocate which return list of devices. func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { - // 日志:函数开始 klog.InfoS("Allocate function started", "request", reqs) responses := pluginapi.AllocateResponse{} nodeName := os.Getenv(util.NodeNameEnvName) klog.InfoS("Processing allocate request on node", "nodeName", nodeName) - // 获取当前待处理的 Pod 信息 current, err := util.GetPendingPod(ctx, nodeName) if err != nil { klog.ErrorS(err, "Failed to get pending pod", "nodeName", nodeName) @@ -416,11 +420,6 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi. 
containerIndex := idx + 1 klog.InfoS("Processing container request", "containerIndex", containerIndex, "totalContainers", len(reqs.ContainerRequests), "namespace", current.Namespace, "podName", current.Name) - //if err := plugin.rm.ValidateRequest(req.DevicesIDs); err != nil { - // klog.ErrorS(err, "Invalid allocation request", "resource", plugin.rm.Resource(), "devicesIDs", req.DevicesIDs, "namespace", current.Namespace, "podName", current.Name) - // return nil, fmt.Errorf("invalid allocation request for %q: %w", plugin.rm.Resource(), err) - //} - currentCtr, devreq, err := GetNextDeviceRequest(nvidia.NvidiaGPUDevice, *current) if err != nil { klog.ErrorS(err, "Failed to get next device request", "nodeName", nodeName, "namespace", current.Namespace, "podName", current.Name) @@ -490,7 +489,6 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi. }, ) - // 检查 CUDA_DISABLE_CONTROL 环境变量是否存在 found := false for _, val := range currentCtr.Env { if strings.Compare(val.Name, "CUDA_DISABLE_CONTROL") == 0 { @@ -509,7 +507,6 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi. }) } - // 检查许可证文件是否存在 _, err = os.Stat(fmt.Sprintf("%s/vgpu/license", hostHookPath)) if err == nil { response.Mounts = append(response.Mounts, &pluginapi.Mount{ @@ -525,15 +522,12 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi. 
} } - // 将容器响应追加到最终响应中 klog.InfoS("Appending container response", "containerIndex", containerIndex, "totalContainers", len(reqs.ContainerRequests), "namespace", current.Namespace, "podName", current.Name) responses.ContainerResponses = append(responses.ContainerResponses, response) } - // 日志:最终分配响应 klog.InfoS("Final allocate response generated", "response", responses) - // 标记 Pod 分配成功 device.PodAllocationTrySuccess(nodeName, nvidia.NvidiaGPUDevice, NodeLockNvidia, current) klog.InfoS("Allocate function completed successfully", "response", responses) @@ -541,7 +535,15 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi. } func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*pluginapi.ContainerAllocateResponse, error) { + klog.InfoS("Start processing allocation response", "requestIds", requestIds) + if plugin.cdiHandler == nil { + return nil, fmt.Errorf("CDI handler not initialized") + } deviceIDs := plugin.deviceIDsFromAnnotatedDeviceIDs(requestIds) + klog.InfoS("Request ID conversion result", + "requestIds", requestIds, + "deviceIDs", deviceIDs, + "strategy", *plugin.config.Flags.Plugin.DeviceIDStrategy) // Create an empty response that will be updated as required below. response := &pluginapi.ContainerAllocateResponse{ @@ -549,10 +551,17 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu } if plugin.deviceListStrategies.AnyCDIEnabled() { responseID := uuid.New().String() + klog.InfoS("Processing CDI devices", + "responseID", responseID, + "deviceCount", len(deviceIDs)) if err := plugin.updateResponseForCDI(response, responseID, deviceIDs...); err != nil { return nil, fmt.Errorf("failed to get allocate response for CDI: %v", err) } + klog.InfoS("CDI response updated successfully", + "annotations", response.Annotations, + "cdiDevices", response.CDIDevices) } + // The following modifications are only made if at least one non-CDI device // list strategy is selected. 
if plugin.deviceListStrategies.AllCDIEnabled() { @@ -575,6 +584,10 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu if *plugin.config.Flags.MOFEDEnabled { response.Envs["NVIDIA_MOFED"] = "enabled" } + klog.InfoS("Successfully generated allocation response", + "envs", response.Envs, + "devices", response.Devices, + "mounts", response.Mounts) return response, nil }