From 0c8376c5a7573ca4f9dd71ace6ab9f9e33f7712b Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Fri, 9 Jan 2026 15:55:23 -0800 Subject: [PATCH 1/2] Refactor spark-rapids.sh: Rename and enhance installation functions - Introduced `install_gpu_xgboost` and added a new function to check for existing RAPIDS JARs. - Introduced `remove_spark_rapids_jar` to clean up existing JARs before installation. - Updated the main function to ensure the correct RAPIDS version is installed, replacing any existing JARs as necessary. - Improved overall structure and readability of the script. --- spark-rapids/spark-rapids.sh | 73 ++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 235eb51b7..7b2ed21f8 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -269,8 +269,7 @@ function execute_with_retries() { return 1 } -function install_spark_rapids() { - local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' +function install_gpu_xgboost() { local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' wget -nv --timeout=30 --tries=5 --retry-connrefused \ @@ -279,6 +278,29 @@ function install_spark_rapids() { wget -nv --timeout=30 --tries=5 --retry-connrefused \ "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ -P /usr/lib/spark/jars/ +} + +function check_spark_rapids_jar() { + local jars_found + jars_found=$(ls /usr/lib/spark/jars/rapids-4-spark_*.jar 2>/dev/null | wc -l) + if [[ $jars_found -gt 0 ]]; then + echo "RAPIDS Spark plugin JAR found" + return 0 + else + echo "RAPIDS Spark plugin JAR not found" + return 1 + fi +} + +function remove_spark_rapids_jar() { + rm -f /usr/lib/spark/jars/rapids-4-spark_2.12-*.jar + echo "Existing RAPIDS Spark plugin JAR removed successfully" +} + +function install_spark_rapids() { + + local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' + wget -nv --timeout=30 --tries=5 --retry-connrefused \ "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \ -P /usr/lib/spark/jars/ @@ -807,28 +829,39 @@ function remove_old_backports { function main() { - if is_debian && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then - remove_old_backports - fi - check_os_and_secure_boot - setup_gpu_yarn - if [[ "${RUNTIME}" == "SPARK" ]]; then + # If the RAPIDS Spark RAPIDS JAR is already installed (common on ML images), replace it with the requested version + # ML images by default have Spark RAPIDS and GPU drivers installed + if check_spark_rapids_jar; then + # This ensures the cluster always uses the desired RAPIDS version, even if a default is present + remove_spark_rapids_jar install_spark_rapids - configure_spark - echo "RAPIDS initialized with Spark runtime" + echo "RAPIDS Spark RAPIDS JAR replaced successfully" else - echo "Unsupported RAPIDS Runtime: ${RUNTIME}" - exit 1 - fi + # Install GPU drivers and setup SPARK RAPIDS JAR for non-ML images + if is_debian && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then + remove_old_backports + fi + check_os_and_secure_boot + setup_gpu_yarn + if [[ "${RUNTIME}" == "SPARK" ]]; then + install_spark_rapids + install_gpu_xgboost + configure_spark + echo "RAPIDS initialized with Spark runtime" + else + echo "Unsupported RAPIDS Runtime: ${RUNTIME}" + exit 1 + fi - for svc in resourcemanager nodemanager; do - if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then - systemctl restart hadoop-yarn-${svc}.service + for svc in resourcemanager nodemanager; do + if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then + systemctl restart hadoop-yarn-${svc}.service + fi + done + if is_debian || is_ubuntu ; then + apt-get clean fi - done - if is_debian || is_ubuntu ; then - apt-get clean fi } -main +main \ No newline at end of file From 54d7aaaaddd1a0283f4f3d321a535fc09f46677e Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Fri, 9 Jan 2026 16:24:40 -0800 Subject: [PATCH 2/2] Update remove_spark_rapids_jar function to use wildcard for JAR removal - Modified the `remove_spark_rapids_jar` function to use a wildcard for matching RAPIDS JAR files, allowing for more flexible removal of existing JARs. - Ensured the main function is properly terminated with a newline for better script formatting. --- spark-rapids/spark-rapids.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 7b2ed21f8..82f3415b1 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -293,7 +293,7 @@ function check_spark_rapids_jar() { } function remove_spark_rapids_jar() { - rm -f /usr/lib/spark/jars/rapids-4-spark_2.12-*.jar + rm -f /usr/lib/spark/jars/rapids-4-spark_*.jar echo "Existing RAPIDS Spark plugin JAR removed successfully" } @@ -864,4 +864,4 @@ function main() { fi } -main \ No newline at end of file +main