Skip to content

Commit 812a3b6

Browse files
committed
templates/spark-rapids/mig.sh.in:
* insert additional templates: common/yarn_functions and gpu/mig_functions
* MIG code is now inline in the main function for mig
* call additional exit handlers: pip_exit_handler and yarn_exit_handler
* call additional prepare handler: prepare_pip_env

templates/spark-rapids/spark-rapids.sh.in:
* remove redundant template disclaimer
* insert additional templates: common/install_functions, gpu/install_functions, gpu/yarn_functions, gpu/spark_functions
* remove redundant call to configure_gpu_script
* call additional exit handlers: gpu_install_exit_handler, yarn_exit_handler
* call additional prepare handler: prepare_gpu_install_env
1 parent 2bcdc2c commit 812a3b6

File tree

2 files changed

+70
-7
lines changed

2 files changed

+70
-7
lines changed

templates/spark-rapids/mig.sh.in

Lines changed: 47 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,13 +14,55 @@
1414
#
1515
[% PROCESS common/template_disclaimer %]
1616

17-
set -euxo pipefail
18-
1917
[% INSERT common/util_functions %]
2018

19+
[% INSERT common/yarn_functions %]
20+
21+
[% INSERT gpu/mig_functions %]
22+
2123
[% INSERT gpu/util_functions %]
2224

25+
set -euxo pipefail
26+
2327
function main() {
28+
if [[ "${nvsmi_works}" == "1" ]] ; then
29+
# if this is called without the MIG script then the drivers are not installed
30+
query_nvsmi
31+
local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
32+
set +e
33+
migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
34+
set -e
35+
NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
36+
37+
if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
38+
if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
39+
if (echo "${migquery_result}" | grep Enabled); then
40+
IS_MIG_ENABLED=1
41+
NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
42+
MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
43+
fetch_mig_scripts
44+
fi
45+
fi
46+
fi
47+
fi
48+
49+
# if mig is enabled drivers would have already been installed
50+
if [[ $IS_MIG_ENABLED -eq 0 ]]; then
51+
install_nvidia_gpu_driver
52+
install_cuda
53+
load_kernel_module
54+
55+
#Install GPU metrics collection in Stackdriver if needed
56+
if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
57+
install_gpu_agent
58+
# install_gpu_monitoring_agent
59+
echo 'GPU metrics agent successfully deployed.'
60+
else
61+
echo 'GPU metrics agent has not been installed.'
62+
fi
63+
configure_gpu_exclusive_mode
64+
fi
65+
2466
setup_gpu_yarn
2567

2668
echo "yarn setup complete"
@@ -33,12 +75,15 @@ function main() {
3375

3476
function exit_handler() {
3577
gpu_exit_handler
78+
pip_exit_handler
79+
yarn_exit_handler
3680
common_exit_handler
3781
return 0
3882
}
3983

4084
function prepare_to_install(){
4185
prepare_common_env
86+
prepare_pip_env
4287
prepare_gpu_env
4388
trap exit_handler EXIT
4489
}

templates/spark-rapids/spark-rapids.sh.in

Lines changed: 23 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -22,37 +22,54 @@
2222
# For details see
2323
# github.com/GoogleCloudDataproc/custom-images/tree/main/examples/secure-boot
2424
#
25-
[% PROCESS common/template_disclaimer %]
2625

2726
set -euxo pipefail
2827

2928
[% INSERT common/util_functions %]
30-
29+
[% INSERT common/install_functions %]
3130
[% INSERT gpu/util_functions %]
31+
[% INSERT gpu/install_functions %]
32+
[% INCLUDE gpu/yarn_functions %]
33+
[% INSERT gpu/spark_functions %]
3234

3335
function main() {
36+
install_gpu_driver_and_cuda
37+
38+
#Install GPU metrics collection in Stackdriver if needed
39+
if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
40+
# install_gpu_agent
41+
install_gpu_monitoring_agent
42+
echo 'GPU metrics agent successfully deployed.'
43+
else
44+
echo 'GPU metrics agent has not been installed.'
45+
fi
46+
configure_gpu_exclusive_mode
47+
3448
setup_gpu_yarn
3549

3650
echo "yarn setup complete"
3751

3852
if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
3953
install_spark_rapids
40-
configure_gpu_script
4154
echo "RAPIDS initialized with Spark runtime"
4255
elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
43-
# we are not currently tooled for installing dask in this action.
44-
echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh"
56+
echo "This action only installs spark-rapids"
57+
exit 1
4558
else
4659
echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
60+
exit 1
4761
fi
4862

4963
echo "main complete"
5064
return 0
5165
}
5266

5367
function exit_handler() {
68+
set +e
69+
gpu_install_exit_handler
5470
gpu_exit_handler
5571
pip_exit_handler
72+
yarn_exit_handler
5673
common_exit_handler
5774
return 0
5875
}
@@ -61,6 +78,7 @@ function prepare_to_install(){
6178
prepare_common_env
6279
prepare_pip_env
6380
prepare_gpu_env
81+
prepare_gpu_install_env
6482
trap exit_handler EXIT
6583
}
6684

0 commit comments

Comments
 (0)