Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions spark-rapids/spark-rapids.sh
Original file line number Diff line number Diff line change
Expand Up @@ -501,17 +501,20 @@ function install_nvidia_gpu_driver() {

elif is_rocky ; then

# Ensure the Correct Kernel Development Packages are Installed
execute_with_retries "dnf -y -q update --exclude=systemd*,kernel*"
execute_with_retries "dnf -y -q install pciutils kernel-devel gcc"
# Install kernel development packages
execute_with_retries "dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r)"

readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
execute_with_retries "dnf clean all"
configure_dkms_certs
execute_with_retries "dnf -y -q module install nvidia-driver:latest-dkms"
clear_dkms_key
execute_with_retries "dnf -y -q install cuda-toolkit"
# Download the CUDA installer run file
curl -fsSL --retry-connrefused --retry 3 --retry-max-time 30 -o driver.run \
"https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/cuda_${CUDA_VERSION}_${NVIDIA_DRIVER_VERSION}_linux.run"

# Run the installer in silent mode
execute_with_retries "bash driver.run --silent --driver --toolkit --no-opengl-libs"

# Remove the installer file after installation to clean up
rm driver.run
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Using `rm driver.run` could cause the script to fail if `driver.run` was not downloaded successfully. It's safer to use `rm -f driver.run` to prevent an error if the file does not exist.

Suggested change
rm driver.run
rm -f driver.run


# Load the NVIDIA kernel module
modprobe nvidia

else
Expand Down
12 changes: 6 additions & 6 deletions spark-rapids/test_spark_rapids.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ def verify_spark_job_sql(self):
("STANDARD", ["w-0"], GPU_T4))
def test_spark_rapids(self, configuration, machine_suffixes, accelerator):

if self.getImageOs() == "rocky":
self.skipTest("Not supported for Rocky OS")
if self.getImageVersion() > pkg_resources.parse_version("2.0") and self.getImageOs() == "rocky":
self.skipTest("Not supported for Rocky 9")

if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in 2.0 and earlier images")
Expand Down Expand Up @@ -88,8 +88,8 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
("STANDARD", ["w-0"], GPU_T4))
def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):

if self.getImageOs() == "rocky":
self.skipTest("Not supported for Rocky OS")
if self.getImageVersion() > pkg_resources.parse_version("2.0") and self.getImageOs() == "rocky":
self.skipTest("Not supported for Rocky 9")

if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in 2.0 and earlier images")
Expand Down Expand Up @@ -118,8 +118,8 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
def test_non_default_cuda_versions(self, configuration, machine_suffixes,
accelerator, cuda_version, driver_version):

if self.getImageOs() == "rocky":
self.skipTest("Not supported for Rocky OS")
if self.getImageVersion() > pkg_resources.parse_version("2.0") and self.getImageOs() == "rocky":
self.skipTest("Not supported for Rocky 9")

if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in 2.0 and earlier images")
Expand Down