GoogleCloudDataproc
diff --git a/‎cloudbuild/presubmit.sh‎
Lines changed: 1 addition & 0 deletions b/‎cloudbuild/presubmit.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cloudbuild/run-presubmit-on-k8s.sh‎
Lines changed: 13 additions & 3 deletions b/‎cloudbuild/run-presubmit-on-k8s.sh‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎gpu/README.md‎
Lines changed: 2 additions & 2 deletions b/‎gpu/README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎gpu/gpu_test_case_base.py‎
Lines changed: 136 additions & 0 deletions b/‎gpu/gpu_test_case_base.py‎
Lines changed: 136 additions & 0 deletions
@@ -70,6 +70,7 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
     if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
+      continue # remove this line before submission
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0
 
@@ -66,11 +66,21 @@ kubectl wait --for=condition=Ready "pod/${POD_NAME}" --timeout=15m
 
 # To mitigate problems with early test failure, retry kubectl logs
 sleep 10s
-while ! kubectl describe "pod/${POD_NAME}" | grep -q Terminated; do
-  # Try to stream logs, but primary log capture is now in the trap
+while true; do
+  if ! kubectl describe "pod/${POD_NAME}" > /dev/null 2>&1; then
+    echo "Pod ${POD_NAME} not found, assuming it has been deleted."
+    break # Exit the loop if the pod doesn't exist
+  fi
+
+  if kubectl describe "pod/${POD_NAME}" | grep -q Terminated; then
+    echo "Pod ${POD_NAME} is Terminated."
+    break # Exit the loop if the pod is Terminated
+  fi
+
+  # Try to stream logs
   kubectl logs -f "${POD_NAME}" --since-time="${LOGS_SINCE_TIME}" --timestamps=true || true
   LOGS_SINCE_TIME=$(date --iso-8601=seconds)
-  sleep 2 # Short sleep to avoid busy waiting if logs -f exits
+  sleep 2
 done
 
 # Final check on the pod exit code
 
@@ -26,7 +26,7 @@ Refer to internal arrays in `install_gpu_driver.sh` for the full matrix.)*
 
 CUDA | Full Version | Driver    | cuDNN     | NCCL   | Tested Dataproc Image Versions
 -----| ------------ | --------- | --------- | -------| ---------------------------
-11.8 | 11.8.0       | 525.147.05| 9.5.1.17  | 2.21.5 | 2.0, 2.1 (Debian/Ubuntu/Rocky)
+11.8 | 11.8.0       | 525.147.05| 9.5.1.17  | 2.21.5 | 2.0, 2.1 (Debian/Ubuntu/Rocky); 2.2 (Ubuntu 22.04)
 12.0 | 12.0.1       | 525.147.05| 8.8.1.3   | 2.16.5 | 2.0, 2.1 (Debian/Ubuntu/Rocky); 2.2 (Rocky 9, Ubuntu 22.04)
 12.4 | 12.4.1       | 550.135   | 9.1.0.70  | 2.23.4 | 2.1 (Ubuntu 20.04, Rocky 8); Dataproc 2.2+
 12.6 | 12.6.3       | 550.142   | 9.6.0.74  | 2.23.4 | 2.1 (Ubuntu 20.04, Rocky 8); Dataproc 2.2+
@@ -324,4 +324,4 @@ handles metric creation and reporting.
     Debian-based systems, including handling of archived backports repositories
     to ensure dependencies can be met.
   * Tested primarily with Dataproc 2.0+ images. Support for older Dataproc
-    1.5 images is limited.
+    1.5 images is limited.
@@ -0,0 +1,136 @@
+import os
+import time
+import random
+from packaging import version
+from integration_tests.dataproc_test_case import DataprocTestCase
+
+DEFAULT_TIMEOUT = 45  # minutes
+
+class GpuTestCaseBase(DataprocTestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def run_dataproc_job(self,
+                         cluster_name,
+                         job_type,
+                         job_params,
+                         timeout_in_minutes=DEFAULT_TIMEOUT):
+        """Executes Dataproc job on a cluster and returns results.
+
+        Args:
+            cluster_name: cluster name to submit job to
+            job_type: type of the job, e.g. spark, hadoop, pyspark
+            job_params: job parameters
+            timeout_in_minutes: timeout in minutes
+
+        Returns:
+            ret_code: the return code of the job
+            stdout: standard output of the job
+            stderr: error output of the job
+        """
+
+        ret_code, stdout, stderr = DataprocTestCase.run_command(
+            'gcloud dataproc jobs submit {} --cluster={} --region={} {}'.
+            format(job_type, cluster_name, self.REGION,
+                   job_params), timeout_in_minutes)
+        return ret_code, stdout, stderr
+
+    # Tests for PyTorch
+    TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py"
+
+    # Tests for TensorFlow
+    TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py"
+
+    def assert_instance_command(self,
+                             instance,
+                             cmd,
+                             timeout_in_minutes=DEFAULT_TIMEOUT):
+        retry_count = 5
+        ssh_cmd = 'gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60 -o StrictHostKeyChecking=no'.format(
+            instance, self.cluster_zone, cmd.replace('"', '\"'))
+
+        while retry_count > 0:
+            try:
+                # Use self.assert_command from DataprocTestCase
+                ret_code, stdout, stderr = self.assert_command(ssh_cmd, timeout_in_minutes)
+                return ret_code, stdout, stderr
+            except Exception as e:
+                print(f"An error occurred in assert_instance_command: {e}")
+                retry_count -= 1
+                if retry_count > 0:
+                    print(f"Retrying in 10 seconds...")
+                    time.sleep(10)
+                    continue
+                else:
+                    print("Max retries reached.")
+                    raise
+
+    def verify_instance(self, name):
+        # Verify that nvidia-smi works
+        self.assert_instance_command(name, "nvidia-smi", 1)
+        print(f"OK: nvidia-smi on {name}")
+
+    def verify_instance_gpu_agent(self, name):
+        print(f"--- Verifying GPU Agent on {name} ---")
+        self.assert_instance_command(
+            name, "systemctl is-active gpu-utilization-agent.service")
+        print(f"OK: GPU Agent on {name}")
+
+    def get_dataproc_image_version(self, instance):
+        _, stdout, _ = self.assert_instance_command(instance, "grep DATAPROC_IMAGE_VERSION /etc/environment | cut -d= -f2")
+        return stdout.strip()
+
+    def version_lt(self, v1, v2):
+        return version.parse(v1) < version.parse(v2)
+
+    def verify_pytorch(self, name):
+        print(f"--- Verifying PyTorch on {name} ---")
+        test_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "gpu",
+                                   self.TORCH_TEST_SCRIPT_FILE_NAME)
+        self.upload_test_file(test_filename, name)
+
+        image_version = self.get_dataproc_image_version(name)
+        conda_root_path = "/opt/conda/miniconda3"
+        if not self.version_lt(image_version, "2.3"):
+            conda_root_path = "/opt/conda"
+
+        conda_env = "dpgce"
+        env_path = f"{conda_root_path}/envs/{conda_env}"
+        python_bin = f"{env_path}/bin/python3"
+
+        verify_cmd = (
+            f"for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node; do "
+            f"  if [[ -e \\\"$f\\\" ]]; then echo 0 > \\\"$f\\\"; fi; "
+            f"done; "
+            f"if /usr/share/google/get_metadata_value attributes/include-pytorch; then"
+            f"  {python_bin} {self.TORCH_TEST_SCRIPT_FILE_NAME}; "
+            f"else echo 'PyTorch test skipped as include-pytorch is not set'; fi"
+        )
+        _, stdout, _ = self.assert_instance_command(name, verify_cmd)
+        if "PyTorch test skipped" not in stdout:
+             self.assertTrue("True" in stdout, f"PyTorch CUDA not available or python not found in {env_path}")
+        print(f"OK: PyTorch on {name}")
+        self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name)
+
+    def verify_tensorflow(self, name):
+        print(f"--- Verifying TensorFlow on {name} ---")
+        test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "gpu",
+                               self.TF_TEST_SCRIPT_FILE_NAME)
+        self.upload_test_file(test_filename, name)
+
+        image_version = self.get_dataproc_image_version(name)
+        conda_root_path = "/opt/conda/miniconda3"
+        if not self.version_lt(image_version, "2.3"):
+            conda_root_path = "/opt/conda"
+
+        conda_env="dpgce"
+        env_path = f"{conda_root_path}/envs/{conda_env}"
+        python_bin = f"{env_path}/bin/python3"
+
+        verify_cmd = (
+            f"for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${{f}} ; done ;"
+            f"{python_bin} {self.TF_TEST_SCRIPT_FILE_NAME}"
+        )
+        self.assert_instance_command(name, verify_cmd)
+        print(f"OK: TensorFlow on {name}")
+        self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name)