diff --git a/nvidia-driver-installer/centos/Dockerfile b/nvidia-driver-installer/centos/Dockerfile new file mode 100644 index 000000000..9fefa3d06 --- /dev/null +++ b/nvidia-driver-installer/centos/Dockerfile @@ -0,0 +1,28 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM centos:7 + +RUN yum update -y && \ + yum install -y gcc make && \ + yum clean all && \ + rm -rf /var/cache/yum + +#RUN apt-get update && \ +# apt-get install -y kmod gcc make curl && \ +# rm -rf /var/lib/apt/lists/* + +COPY entrypoint.sh /entrypoint.sh + +CMD /entrypoint.sh diff --git a/nvidia-driver-installer/centos/Makefile b/nvidia-driver-installer/centos/Makefile new file mode 100644 index 000000000..1de58ed23 --- /dev/null +++ b/nvidia-driver-installer/centos/Makefile @@ -0,0 +1,25 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +TAG?=latest +REGISTRY?=gcr.io/google-containers +IMAGE=centos-nvidia-driver-installer + +container: + docker build --pull -t ${REGISTRY}/${IMAGE}:${TAG} . + +push: + gcloud docker -- push ${REGISTRY}/${IMAGE}:${TAG} + +.PHONY: container push diff --git a/nvidia-driver-installer/centos/daemonset.yaml b/nvidia-driver-installer/centos/daemonset.yaml new file mode 100644 index 000000000..472e64ae1 --- /dev/null +++ b/nvidia-driver-installer/centos/daemonset.yaml @@ -0,0 +1,79 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-driver-installer + namespace: kube-system + labels: + k8s-app: nvidia-driver-installer +spec: + selector: + matchLabels: + k8s-app: nvidia-driver-installer + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-driver-installer + k8s-app: nvidia-driver-installer + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: Exists + tolerations: + - key: "nvidia.com/gpu" + effect: "NoSchedule" + operator: "Exists" + volumes: + - name: dev + hostPath: + path: /dev + - name: nvidia-install-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: root-mount + hostPath: + path: / + initContainers: + - image: gcr.io/google-containers/centos-nvidia-driver-installer@sha256:to-be-build + name: nvidia-driver-installer + resources: + requests: + cpu: 0.15 + securityContext: + privileged: true + env: + - name: NVIDIA_INSTALL_DIR_HOST + value: /home/kubernetes/bin/nvidia + - name: NVIDIA_INSTALL_DIR_CONTAINER + value: /usr/local/nvidia + - name: ROOT_MOUNT_DIR + value: /root + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia + - name: dev + mountPath: /dev + - name: root-mount + mountPath: /root + containers: + - image: "gcr.io/google-containers/pause:2.0" + name: pause diff --git a/nvidia-driver-installer/centos/entrypoint.sh b/nvidia-driver-installer/centos/entrypoint.sh new file mode 100755 index 000000000..2663ea9d7 --- /dev/null +++ b/nvidia-driver-installer/centos/entrypoint.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -u + +set -x +NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-384.111}" +NVIDIA_DRIVER_DOWNLOAD_URL_DEFAULT="https://us.download.nvidia.com/tesla/${NVIDIA_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run" +NVIDIA_DRIVER_DOWNLOAD_URL="${NVIDIA_DRIVER_DOWNLOAD_URL:-$NVIDIA_DRIVER_DOWNLOAD_URL_DEFAULT}" +NVIDIA_INSTALL_DIR_HOST="${NVIDIA_INSTALL_DIR_HOST:-/var/lib/nvidia}" +NVIDIA_INSTALL_DIR_CONTAINER="${NVIDIA_INSTALL_DIR_CONTAINER:-/usr/local/nvidia}" +NVIDIA_INSTALLER_RUNFILE="$(basename "${NVIDIA_DRIVER_DOWNLOAD_URL}")" +ROOT_MOUNT_DIR="${ROOT_MOUNT_DIR:-/root}" +CACHE_FILE="${NVIDIA_INSTALL_DIR_CONTAINER}/.cache" +KERNEL_VERSION="$(uname -r)" +set +x + +check_cached_version() { + echo "Checking cached version" + if [[ ! -f "${CACHE_FILE}" ]]; then + echo "Cache file ${CACHE_FILE} not found." + return 1 + fi + + # Source the cache file and check if the cached driver matches + # currently running kernel version and requested driver versions. + . "${CACHE_FILE}" + if [[ "${KERNEL_VERSION}" == "${CACHE_KERNEL_VERSION}" ]]; then + if [[ "${NVIDIA_DRIVER_VERSION}" == "${CACHE_NVIDIA_DRIVER_VERSION}" ]]; then + echo "Found existing driver installation for kernel version ${KERNEL_VERSION} and driver version ${NVIDIA_DRIVER_VERSION}." + return 0 + fi + fi + echo "Cache file ${CACHE_FILE} found but existing versions didn't match." + return 1 +} + +update_cached_version() { + cat >"${CACHE_FILE}"<<__EOF__ +CACHE_KERNEL_VERSION=${KERNEL_VERSION} +CACHE_NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION} +__EOF__ + + echo "Updated cached version as:" + cat "${CACHE_FILE}" +} + +update_container_ld_cache() { + echo "Updating container's ld cache..." + echo "${NVIDIA_INSTALL_DIR_CONTAINER}/lib64" > /etc/ld.so.conf.d/nvidia.conf + ldconfig + echo "Updating container's ld cache... DONE." +} + +download_kernel_src() { + echo "Downloading kernel sources..." + # apt-get update && apt-get install -y linux-headers-${KERNEL_VERSION} + yum update -y && yum install -y kernel-devel-${KERNEL_VERSION} + echo "Downloading kernel sources... DONE." +} + +configure_nvidia_installation_dirs() { + echo "Configuring installation directories..." + mkdir -p "${NVIDIA_INSTALL_DIR_CONTAINER}" + pushd "${NVIDIA_INSTALL_DIR_CONTAINER}" + + # nvidia-installer does not provide an option to configure the + # installation path of `nvidia-modprobe` utility and always installs it + # under /usr/bin. The following workaround ensures that + # `nvidia-modprobe` is accessible outside the installer container + # filesystem. + mkdir -p bin bin-workdir + mount -t overlay -o lowerdir=/usr/bin,upperdir=bin,workdir=bin-workdir none /usr/bin + + # nvidia-installer does not provide an option to configure the + # installation path of libraries such as libnvidia-ml.so. The following + # workaround ensures that the libs are accessible from outside the + # installer container filesystem. + mkdir -p lib64 lib64-workdir + mkdir -p /usr/lib/x86_64-linux-gnu + mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=lib64-workdir none /usr/lib/x86_64-linux-gnu + + # nvidia-installer does not provide an option to configure the + # installation path of driver kernel modules such as nvidia.ko. The following + # workaround ensures that the modules are accessible from outside the + # installer container filesystem. + mkdir -p drivers drivers-workdir + mkdir -p /lib/modules/${KERNEL_VERSION}/video + mount -t overlay -o lowerdir=/lib/modules/${KERNEL_VERSION}/video,upperdir=drivers,workdir=drivers-workdir none /lib/modules/${KERNEL_VERSION}/video + + # Populate ld.so.conf to avoid warning messages in nvidia-installer logs. + update_container_ld_cache + + # Install an exit handler to cleanup the overlayfs mount points. + # /usr/bin is unmounted in a lazy way since /bin is a symlink to /usr/bin in centos + # this causes /usr/bin/bash to be always in use and be ununmountable. + trap "{ umount /lib/modules/${KERNEL_VERSION}/video; umount /usr/lib/x86_64-linux-gnu ; umount -l /usr/bin; }" EXIT + popd + echo "Configuring installation directories... DONE." +} + +download_nvidia_installer() { + echo "Downloading Nvidia installer..." + pushd "${NVIDIA_INSTALL_DIR_CONTAINER}" + curl -L -S -f "${NVIDIA_DRIVER_DOWNLOAD_URL}" -o "${NVIDIA_INSTALLER_RUNFILE}" + popd + echo "Downloading Nvidia installer... DONE." +} + +run_nvidia_installer() { + echo "Running Nvidia installer..." + pushd "${NVIDIA_INSTALL_DIR_CONTAINER}" + sh "${NVIDIA_INSTALLER_RUNFILE}" \ + --utility-prefix="${NVIDIA_INSTALL_DIR_CONTAINER}" \ + --opengl-prefix="${NVIDIA_INSTALL_DIR_CONTAINER}" \ + --no-install-compat32-libs \ + --log-file-name="${NVIDIA_INSTALL_DIR_CONTAINER}/nvidia-installer.log" \ + --kernel-source-path=/usr/src/kernels/${KERNEL_VERSION} \ + --no-drm \ + --silent \ + --accept-license + popd + echo "Running Nvidia installer... DONE." +} + +configure_cached_installation() { + echo "Configuring cached driver installation..." + update_container_ld_cache + if ! lsmod | grep -q -w 'nvidia'; then + insmod "${NVIDIA_INSTALL_DIR_CONTAINER}/drivers/nvidia.ko" + fi + if ! lsmod | grep -q -w 'nvidia_uvm'; then + insmod "${NVIDIA_INSTALL_DIR_CONTAINER}/drivers/nvidia-uvm.ko" + fi + echo "Configuring cached driver installation... DONE" +} + +verify_nvidia_installation() { + echo "Verifying Nvidia installation..." + export PATH="${NVIDIA_INSTALL_DIR_CONTAINER}/bin:${PATH}" + nvidia-smi + # Create unified memory device file. + nvidia-modprobe -c0 -u + echo "Verifying Nvidia installation... DONE." +} + +update_host_ld_cache() { + echo "Updating host's ld cache..." + echo "${NVIDIA_INSTALL_DIR_HOST}/lib64" >> "${ROOT_MOUNT_DIR}/etc/ld.so.conf" + ldconfig -r "${ROOT_MOUNT_DIR}" + echo "Updating host's ld cache... DONE." +} + +main() { + if check_cached_version; then + configure_cached_installation + verify_nvidia_installation + else + download_kernel_src + configure_nvidia_installation_dirs + download_nvidia_installer + run_nvidia_installer + update_cached_version + verify_nvidia_installation + fi + update_host_ld_cache +} + +main "$@"