diff --git a/docker/Dockerfile.worker b/docker/Dockerfile.worker
index d4d5857e..a6e30c70 100644
--- a/docker/Dockerfile.worker
+++ b/docker/Dockerfile.worker
@@ -2,6 +2,7 @@ ARG PYTHON_VERSION=3.13
 FROM python:$PYTHON_VERSION-slim-bookworm AS base
 RUN apt-get update && apt-get install -y --no-install-recommends \
+    tini \
     openjdk-17-jdk-headless \
     # required for HDFS/Hive with Kerberos enabled
     krb5-user \
diff --git a/docker/entrypoint_worker.sh b/docker/entrypoint_worker.sh
index 90a61ec4..2cf8d7e1 100755
--- a/docker/entrypoint_worker.sh
+++ b/docker/entrypoint_worker.sh
@@ -1,8 +1,101 @@
 #!/usr/bin/env bash
-set -e
-# https://docs.celeryq.dev/en/stable/userguide/workers.html#max-tasks-per-child-setting
-# Required to start each Celery task in separated process, avoiding issues with global Spark session object
+# Based on https://github.com/apache/spark/blob/v3.5.7/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh
 
-# exec is required to forward all signals to the main process
-exec python -m celery -A syncmaster.worker.celery worker --max-tasks-per-child=1 "$@"
+set -ex
+
+if [ -z "$JAVA_HOME" ]; then
+  JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
+fi
+if [ -z "$SPARK_HOME" ]; then
+  SPARK_HOME=$(python -c 'import pyspark; import pathlib; print(pathlib.Path(pyspark.__file__).parent.resolve())')
+fi
+
+SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
+
+env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > java_opts.txt
+if [ "$(command -v readarray)" ]; then
+  readarray -t SPARK_EXECUTOR_JAVA_OPTS < java_opts.txt
+else
+  # Fallback for shells without the readarray builtin: read the file line by line
+  SPARK_EXECUTOR_JAVA_OPTS=()
+  while IFS= read -r java_opt; do
+    SPARK_EXECUTOR_JAVA_OPTS+=("$java_opt")
+  done < java_opts.txt
+fi
+
+if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
+  SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
+fi
+
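+# ${VAR+x} expands to "x" when VAR is set (even to an empty string) and to
+# nothing when VAR is unset, so the checks below only act on variables that
+# were explicitly provided to the container.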
+if ! [ -z ${PYSPARK_PYTHON+x} ]; then
+  export PYSPARK_PYTHON
+fi
+if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then
+  export PYSPARK_DRIVER_PYTHON
+fi
+
+# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
+# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
+if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
+  export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
+fi
+
+if ! [ -z ${HADOOP_CONF_DIR+x} ]; then
+  SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
+fi
+
+if ! [ -z ${SPARK_CONF_DIR+x} ]; then
+  SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
+elif ! [ -z ${SPARK_HOME+x} ]; then
+  SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
+fi
+
+# Place IVY2 jars at the end of classpath to avoid conflicts with Spark and Hadoop jars
+IVY2_HOME=$(realpath ~/.ivy2)
+SPARK_CLASSPATH="$SPARK_CLASSPATH:${IVY2_HOME}/jars/*"
+
+# SPARK-43540: add current working directory into executor classpath
+SPARK_CLASSPATH="$SPARK_CLASSPATH:$PWD"
+
+case "$1" in
+  driver)
+    shift 1
+    CMD=(
+      "$SPARK_HOME/bin/spark-submit"
+      --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS"
+      --conf "spark.executorEnv.SPARK_DRIVER_POD_IP=$SPARK_DRIVER_BIND_ADDRESS"
+      --deploy-mode client
+      "$@"
+    )
+    ;;
+  executor)
+    shift 1
+    CMD=(
+      ${JAVA_HOME}/bin/java
+      "${SPARK_EXECUTOR_JAVA_OPTS[@]}"
+      -Xms$SPARK_EXECUTOR_MEMORY
+      -Xmx$SPARK_EXECUTOR_MEMORY
+      -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
+      org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend
+      --driver-url $SPARK_DRIVER_URL
+      --executor-id $SPARK_EXECUTOR_ID
+      --cores $SPARK_EXECUTOR_CORES
+      --app-id $SPARK_APPLICATION_ID
+      --hostname $SPARK_EXECUTOR_POD_IP
+      --resourceProfileId $SPARK_RESOURCE_PROFILE_ID
+      --podName $SPARK_EXECUTOR_POD_NAME
+    )
+    ;;
+
+  *)
+    # https://docs.celeryq.dev/en/stable/userguide/workers.html#max-tasks-per-child-setting
+    # --max-tasks-per-child=1 is required to start each Celery task in a separate process,
+    # avoiding issues with the global Spark session object
+    CMD=(
+      python -m celery
+      -A syncmaster.worker.celery
+      worker
+      --max-tasks-per-child=1
+      "$@"
+    )
+    ;;
+esac
+
+# Execute the container CMD under tini for better hygiene
+exec /usr/bin/tini -s -- "${CMD[@]}"
diff --git a/docs/changelog/next_release/295.feature.rst b/docs/changelog/next_release/295.feature.rst
new file mode 100644
index 00000000..e37b1f9b
--- /dev/null
+++ b/docs/changelog/next_release/295.feature.rst
@@ -0,0 +1 @@
+Allow using the SyncMaster worker image as ``spark.kubernetes.container.image``.
diff --git a/docs/reference/worker/create_spark_session.rst b/docs/reference/worker/create_spark_session.rst
index b6383a19..41ee0b45 100644
--- a/docs/reference/worker/create_spark_session.rst
+++ b/docs/reference/worker/create_spark_session.rst
@@ -23,6 +23,20 @@ It is possible to alter default `Spark Session configuration
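
With this entrypoint, one image can serve both roles: Celery falls through to the
default branch of the ``case`` statement, while Spark on Kubernetes starts the same
image with ``executor`` as the first argument. A minimal sketch of pointing a PySpark
session at the worker image; the master URL, namespace, and image tag below are
hypothetical placeholders, not values taken from this diff:

    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        # Example in-cluster Kubernetes API endpoint and namespace
        .master("k8s://https://kubernetes.default.svc:443")
        .config("spark.kubernetes.namespace", "syncmaster")
        # Reuse the SyncMaster worker image for executor pods; its entrypoint
        # dispatches on the first argument, which Spark sets to "executor"
        .config("spark.kubernetes.container.image", "syncmaster-worker:latest")
        .getOrCreate()
    )

Because the driver stays inside the Celery worker process (client deploy mode, as in
the ``driver)`` branch above), executors only need network access back to the worker pod.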