diff --git a/runtimeconfig/docker/Dockerfile b/runtimeconfig/docker/Dockerfile
index 809c10cb473994be9aa5ccbe707f9570b15dca48..56b6cd25285ecf1c53854e20a8c658114f7f5f8c 100644
--- a/runtimeconfig/docker/Dockerfile
+++ b/runtimeconfig/docker/Dockerfile
@@ -6,24 +6,18 @@
 RUN apt-get update && apt-get install -y --no-install-recommends \
     curl \
     gnupg2 \
     jq \
-    moreutils \
     software-properties-common
 
-RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - && \
-    curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add - && \
+RUN curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add - && \
     curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | \
-        tee /etc/apt/sources.list.d/nvidia-docker.list && \
-    add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
-        $(lsb_release -cs) stable"
+        tee /etc/apt/sources.list.d/nvidia-docker.list
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    docker-ce-cli \
     nvidia-container-runtime
 
 WORKDIR /work
 
 COPY src/. .
 
-RUN cp /etc/nvidia-container-runtime/config.toml ./
 RUN chmod +x /work/run.sh
diff --git a/runtimeconfig/src/common.sh b/runtimeconfig/src/common.sh
index 9697f10e1e314263ea33d54346a76afc28e22653..35ddef9a3c4cf8aaa5b389ec520378d2f6507847 100644
--- a/runtimeconfig/src/common.sh
+++ b/runtimeconfig/src/common.sh
@@ -27,3 +27,30 @@ log() {
 	printf "%s[%s]%s %b\n" "${fmt_on}" "${level}" "${fmt_off}" "${message}" >&2
 }
 
+# with_retry MAX_ATTEMPTS DELAY COMMAND [ARGS...]
+# Runs COMMAND until it succeeds, sleeping DELAY between attempts;
+# a MAX_ATTEMPTS of 0 or less retries forever.
+with_retry() {
+	local max_attempts="$1"
+	local delay="$2"
+	local count=0
+	local rc
+	shift 2
+
+	while true; do
+		set +e
+		"$@"; rc="$?"
+		set -e
+
+		count="$((count+1))"
+
+		if [[ "${rc}" -eq 0 ]]; then
+			return 0
+		fi
+
+		if [[ "${max_attempts}" -le 0 ]] || [[ "${count}" -lt "${max_attempts}" ]]; then
+			sleep "${delay}"
+		else
+			break
+		fi
+	done
+
+	return 1
+}
diff --git a/runtimeconfig/src/docker.sh b/runtimeconfig/src/docker.sh
index c84c3ed948891e45394ac96b72d3800e8c0e7470..3d0596e3050f392c5c2c70f352b1deecb6f1eb14 100644
--- a/runtimeconfig/src/docker.sh
+++ b/runtimeconfig/src/docker.sh
@@ -1,75 +1,98 @@
 #! /bin/bash
 # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 
+readonly DOCKER_CONFIG="/etc/docker/daemon.json"
+
 docker::info() {
-	local -r docker_socket="${1:-unix:///var/run/docker.socket}"
+	local -r docker_socket="${1:-/var/run/docker.sock}"
+	# -s keeps curl's progress meter out of stderr; jq -r strips the quotes
+	# so the path can be compared directly in docker::setup
+	curl -s --unix-socket "${docker_socket}" 'http://localhost/v1.40/info' | jq -r '.Runtimes.nvidia.path'
+}
 
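+# NOTE (sketch): dockerd may still be starting when this is queried (Docker in
+# Docker has a startup race), so callers are expected to wrap docker::info in
+# with_retry from common.sh, e.g.:
+#   with_retry 5 2s docker::info "/var/run/docker.sock"
+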
-d "/etc/docker" ]]; then + log ERROR "Docker directory doesn't exist in container" + log ERROR "Ensure that you have correctly mounted the docker directoy" + exit 1 + fi - docker -H "${docker_socket}" info -f '{{json .Runtimes}}' | jq '.nvidia.path' - return - done - exit 1 } -# Echo an empty config if the config file doesn't exist -docker::daemon_config() { - local -r daemon_file="${1:-"/etc/docker/daemon.json"}" - ([[ -f "${daemon_file}" ]] && cat "${daemon_file}") || echo {} +docker::config::backup() { + if [[ -f "${DOCKER_CONFIG}" ]]; then + mv "${DOCKER_CONFIG}" "${DOCKER_CONFIG}.bak" + fi } -docker::refresh_configuration() { - log INFO "Refreshing the docker daemon configuration" - pkill -SIGHUP dockerd +docker::config::restore() { + if [[ -f "${DOCKER_CONFIG}" ]]; then + mv "${DOCKER_CONFIG}.bak" "${DOCKER_CONFIG}" + fi } -docker::update_config_file() { +docker::config::add_runtime() { local -r destination="${1:-/run/nvidia}" local -r nvcr="${destination}/nvidia-container-runtime" - local config_json - IFS='' read -r config_json - - echo "${config_json}" | \ + cat - | \ + jq -r ".runtimes = {}" | \ jq -r ".runtimes += {\"nvidia\": {\"path\": \"${nvcr}\"}}" | \ jq -r '. += {"default-runtime": "nvidia"}' } -docker::ensure_prerequisites() { - # Ensure that the docker config path exists - if [[ ! -d "/etc/docker" ]]; then - log ERROR "Docker directory doesn't exist in container" - log ERROR "Ensure that you have correctly mounted the docker directoy" - exit 1 - fi +docker::config() { + ([[ -f "${DOCKER_CONFIG}" ]] && cat "${DOCKER_CONFIG}") || echo {} +} - mount | grep /etc/docker - if [[ ! $? ]]; then - log ERROR "Docker directory isn't mounted in container" - log ERROR "Ensure that you have correctly mounted the docker directoy" - exit 1 - fi +docker::config::refresh() { + log INFO "Refreshing the docker daemon configuration" + pkill -SIGHUP dockerd +} + +docker::config::get_nvidia_runtime() { + cat - | jq -r '.runtimes | keys[0]' } docker::setup() { - local -r destination="${1:-/run/nvidia}" + docker::ensure::mounted + docker::ensure::config_dir + log INFO "Setting up the configuration for the docker daemon" - docker::ensure_prerequisites + local -r destination="${1:-/run/nvidia}" + local -r docker_socket="${2:-"/var/run/docker.socket"}" + + local config=$(docker::config) + + log INFO "current config: ${config}" - log INFO "current config: $(docker::daemon_config)" + local -r nvidia_runtime="$(with_retry 5 2s docker::info "${docker_socket}")" + + if [[ "${nvidia_runtime}" = "${destination}/nvidia-container-runtime" ]]; then + return + fi # Append the nvidia runtime to the docker daemon's configuration - # We use sponge here because the input file is the output file - config=$(docker::daemon_config | docker::update_config_file "${destination}") - echo "${config}" > /etc/docker/daemon.json + local updated_config=$(echo "${config}" | docker::config::add_runtime "${destination}") + local -r config_runtime=$(echo "${updated_config}" | docker::config::get_nvidia_runtime) + + # If there was an error while parsing the file catch it here + if [[ "${config_runtime}" != "nvidia" ]]; then + config=$(echo "{}" | docker::config::add_runtime "${destination}") + fi + + docker::config::backup + echo "${updated_config}" > /etc/docker/daemon.json - log INFO "after: $(docker::daemon_config | jq .)" - docker::refresh_configuration + log INFO "after: $(docker::config | jq .)" + docker::config::refresh } diff --git a/runtimeconfig/src/run.sh b/runtimeconfig/src/run.sh index 
-docker::ensure_prerequisites() {
-	# Ensure that the docker config path exists
-	if [[ ! -d "/etc/docker" ]]; then
-		log ERROR "Docker directory doesn't exist in container"
-		log ERROR "Ensure that you have correctly mounted the docker directoy"
-		exit 1
-	fi
+docker::config() {
+	([[ -f "${DOCKER_CONFIG}" ]] && cat "${DOCKER_CONFIG}") || echo {}
+}
 
-	mount | grep /etc/docker
-	if [[ ! $? ]]; then
-		log ERROR "Docker directory isn't mounted in container"
-		log ERROR "Ensure that you have correctly mounted the docker directoy"
-		exit 1
-	fi
+docker::config::refresh() {
+	log INFO "Refreshing the docker daemon configuration"
+	pkill -SIGHUP dockerd
+}
+
+docker::config::get_nvidia_runtime() {
+	cat - | jq -r '.runtimes | keys[0]'
 }
 
 docker::setup() {
-	local -r destination="${1:-/run/nvidia}"
+	docker::ensure::mounted
+	docker::ensure::config_dir
+
 	log INFO "Setting up the configuration for the docker daemon"
 
-	docker::ensure_prerequisites
+	local -r destination="${1:-/run/nvidia}"
+	local -r docker_socket="${2:-"/var/run/docker.sock"}"
+
+	local config=$(docker::config)
+
+	log INFO "current config: ${config}"
 
-	log INFO "current config: $(docker::daemon_config)"
+	local -r nvidia_runtime="$(with_retry 5 2s docker::info "${docker_socket}")"
+
+	if [[ "${nvidia_runtime}" = "${destination}/nvidia-container-runtime" ]]; then
+		return
+	fi
 
 	# Append the nvidia runtime to the docker daemon's configuration
-	# We use sponge here because the input file is the output file
-	config=$(docker::daemon_config | docker::update_config_file "${destination}")
-	echo "${config}" > /etc/docker/daemon.json
+	local updated_config=$(echo "${config}" | docker::config::add_runtime "${destination}")
+	local -r config_runtime=$(echo "${updated_config}" | docker::config::get_nvidia_runtime)
+
+	# If there was an error while parsing the file catch it here and fall
+	# back to a minimal config, so a valid file is still written below
+	if [[ "${config_runtime}" != "nvidia" ]]; then
+		updated_config=$(echo "{}" | docker::config::add_runtime "${destination}")
+	fi
+
+	docker::config::backup
+	echo "${updated_config}" > /etc/docker/daemon.json
 
-	log INFO "after: $(docker::daemon_config | jq .)"
-	docker::refresh_configuration
+	log INFO "after: $(docker::config | jq .)"
+	docker::config::refresh
 }
diff --git a/runtimeconfig/src/run.sh b/runtimeconfig/src/run.sh
index f59506688a7eff2625fd3951d821ca1015aecdbf..f907fa214068e8c3d598ff86053ec79c6df107f4 100755
--- a/runtimeconfig/src/run.sh
+++ b/runtimeconfig/src/run.sh
@@ -4,80 +4,33 @@
 set -euxo pipefail
 shopt -s lastpipe
 
-source "common.sh"
-source "docker.sh"
+readonly RUN_DIR="/run/nvidia"
+readonly TOOLKIT_DIR="${RUN_DIR}/toolkit"
 
-install_nvidia_container_runtime_toolkit() {
-	log INFO "Installing the NVIDIA Container Runtime Toolkit"
+readonly basedir="$(dirname "$(realpath "$0")")"
 
-	local -r destination="${1:-/run/nvidia}"
-	local -a packages=("/usr/bin/nvidia-container-runtime" \
-		"/usr/bin/nvidia-container-toolkit" \
-		"/usr/bin/nvidia-container-cli" \
-		"/etc/nvidia-container-runtime/config.toml" \
-		"/usr/lib/x86_64-linux-gnu/libnvidia-container.so.1")
-
-	# TODO workaround until we fix the runtime requiring this
-	# directory and file to exist at that location
-	cp ./config.toml /etc/nvidia-container-runtime
-
-	# Bash variables starts at 0
-	# ZSH variables starts at 1
-	for ((i=0; i < ${#packages[@]}; i++)); do
-		packages[$i]=$(readlink -f ${packages[$i]})
-	done
-
-	if [[ ! -d "${destination}" ]]; then
-		log ERROR "Destination directory doesn't exist in container"
-		log ERROR "Ensure that you have correctly mounted the destination directoy"
-		exit 1
-	fi
-
-	cp "${packages[@]}" "${destination}"
-
-	# Setup links to the real binaries to ensure that variables and configs
-	# are pointing to the right path
-	mv "${destination}/nvidia-container-toolkit" \
-		"${destination}/nvidia-container-toolkit.real"
-	mv "${destination}/nvidia-container-runtime" \
-		"${destination}/nvidia-container-runtime.real"
-
-
-	# Setup aliases so as to ensure that the path is correctly set
-	cat <<- EOF > ${destination}/nvidia-container-toolkit
-		#! /bin/sh
-		LD_LIBRARY_PATH="${destination}" \
-		PATH="\$PATH:${destination}" \
-		${destination}/nvidia-container-toolkit.real \
-			-config "${destination}/config.toml" \
-			\$@
-	EOF
-
-	cat <<- EOF > ${destination}/nvidia-container-runtime
-		#! /bin/sh
-		LD_LIBRARY_PATH="${destination}" \
-		PATH="\$PATH:${destination}" \
-		${destination}/nvidia-container-runtime.real \
-			\$@
-	EOF
-
-	# Make sure that the alias files are executable
-	chmod +x "${destination}/nvidia-container-toolkit"
-	chmod +x "${destination}/nvidia-container-runtime"
-}
+source "${basedir}/common.sh"
+source "${basedir}/toolkit.sh"
+source "${basedir}/docker.sh"
 
 main() {
-	local -r destination="${1:-/run/nvidia}"
-	local -r docker_socket="${2:-/var/run/docker.socket}"
-	local -r nvidia_runtime="$(docker::info ${docker_socket})"
-
-	if [[ "${nvidia_runtime}" = "${destination}/nvidia-container-runtime" ]]; then
-		exit 0
-	fi
-
-	install_nvidia_container_runtime_toolkit "${destination}"
-	docker::setup "${destination}"
-	echo "docker info: $(docker::info ${docker_socket})"
+	local -r destination="${1:-"${RUN_DIR}"}/toolkit"
+	local -r docker_socket="${2:-"/var/run/docker.sock"}"
+
+	toolkit::setup "${destination}"
+	docker::setup "${destination}" "${docker_socket}"
+	echo "docker info: $(docker::info "${docker_socket}")"
+
+	echo "Done, now waiting for signal"
+	sleep infinity &
+
+	# shellcheck disable=SC2064
+	# We want the expand to happen now rather than at trap time
+	# Setup a new signal handler and reset the EXIT signal handler
+	trap "echo 'Caught signal'; toolkit::uninstall && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
+	trap - EXIT
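+	# 'wait' returns early (with status > 128) each time a trapped signal
+	# is handled, so loop until the 'sleep infinity' child actually exits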
+	while true; do wait $! || continue; done
+	exit 0
 }
 
 main "$@"
diff --git a/runtimeconfig/src/toolkit.sh b/runtimeconfig/src/toolkit.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c6cad26c3805df2a6c5cb467fefd45b9a4796a5a
--- /dev/null
+++ b/runtimeconfig/src/toolkit.sh
@@ -0,0 +1,132 @@
+#! /bin/bash
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+
+packages=("/usr/bin/nvidia-container-runtime" \
+	"/usr/bin/nvidia-container-toolkit" \
+	"/usr/bin/nvidia-container-cli" \
+	"/etc/nvidia-container-runtime/config.toml" \
+	"/usr/lib/x86_64-linux-gnu/libnvidia-container.so.1")
+
+toolkit::install() {
+	local -r destination="${1:-"${TOOLKIT_DIR}"}"
+	log INFO "${FUNCNAME[0]} $*"
+
+	mkdir -p "/nvidia" "${destination}"
+	mount --rbind "/nvidia" "${destination}"
+
+	mkdir -p "${destination}"
+	mkdir -p "${destination}/.config/nvidia-container-runtime"
+
+	# Note: Bash arrays start at 0 (zsh arrays start at 1)
+	for ((i=0; i < ${#packages[@]}; i++)); do
+		packages[$i]=$(readlink -f "${packages[$i]}")
+	done
+
+	cp "${packages[@]}" "${destination}"
+	mv "${destination}/config.toml" "${destination}/.config/nvidia-container-runtime/"
+}
+
+toolkit::uninstall() {
+	local -r destination="${1:-"${TOOLKIT_DIR}"}"
+	log INFO "${FUNCNAME[0]} $*"
+
+	if findmnt -r -o TARGET | grep "${destination}" > /dev/null; then
+		umount -l -R "${destination}"
+	fi
+}
+
+toolkit::setup::config() {
+	local -r destination="${1:-"${TOOLKIT_DIR}"}"
+	local -r config_path="${destination}/.config/nvidia-container-runtime/config.toml"
+	log INFO "${FUNCNAME[0]} $*"
+
+	sed -i 's/^#root/root/;' "${config_path}"
+	sed -i "s@/run/nvidia/driver@${RUN_DIR}/driver@;" "${config_path}"
+	sed -i "s;@/sbin/ldconfig.real;@${RUN_DIR}/driver/sbin/ldconfig.real;" "${config_path}"
+}
+
+toolkit::setup::cli_binary() {
+	local -r destination="${1:-"${TOOLKIT_DIR}"}"
+	log INFO "${FUNCNAME[0]} $*"
+
+	# Setup links to the real binaries to ensure that variables and configs
+	# are pointing to the right path
+	mv "${destination}/nvidia-container-cli" \
+		"${destination}/nvidia-container-cli.real"
+
+	# Setup aliases so as to ensure that the path is correctly set
+	cat <<- EOF | tr -s ' \t' > ${destination}/nvidia-container-cli
+		#! /bin/sh
+		LD_LIBRARY_PATH="${destination}" \
+		PATH="\$PATH:${destination}" \
+		${destination}/nvidia-container-cli.real \
+			\$@
+	EOF
+
+	# Make sure that the alias files are executable
+	chmod +x "${destination}/nvidia-container-cli"
+}
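+
+# Illustrative only: with destination=/run/nvidia/toolkit, the wrapper
+# generated above boils down to:
+#   #! /bin/sh
+#   LD_LIBRARY_PATH="/run/nvidia/toolkit" PATH="$PATH:/run/nvidia/toolkit" \
+#       /run/nvidia/toolkit/nvidia-container-cli.real $@
+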
+toolkit::setup::toolkit_binary() {
+	local -r destination="${1:-"${TOOLKIT_DIR}"}"
+	log INFO "${FUNCNAME[0]} $*"
+
+	mv "${destination}/nvidia-container-toolkit" \
+		"${destination}/nvidia-container-toolkit.real"
+
+	cat <<- EOF | tr -s ' \t' > ${destination}/nvidia-container-toolkit
+		#! /bin/sh
+		PATH="\$PATH:${destination}" \
+		${destination}/nvidia-container-toolkit.real \
+			-config "${destination}/.config/nvidia-container-runtime/config.toml" \
+			\$@
+	EOF
+
+	chmod +x "${destination}/nvidia-container-toolkit"
+}
+
+toolkit::setup::runtime_binary() {
+	local -r destination="${1:-"${TOOLKIT_DIR}"}"
+	log INFO "${FUNCNAME[0]} $*"
+
+	mv "${destination}/nvidia-container-runtime" \
+		"${destination}/nvidia-container-runtime.real"
+
+	cat <<- EOF | tr -s ' \t' > ${destination}/nvidia-container-runtime
+		#! /bin/sh
+		PATH="\$PATH:${destination}" \
+		XDG_CONFIG_HOME="${destination}/.config" \
+		${destination}/nvidia-container-runtime.real \
+			\$@
+	EOF
+
+	chmod +x "${destination}/nvidia-container-runtime"
+}
+
+toolkit::setup() {
+	local -r destination="${1:-"${TOOLKIT_DIR}"}"
+	log INFO "Installing the NVIDIA Container Toolkit"
+
+	# shellcheck disable=SC2064
+	# We want the expand to happen now rather than at trap time
+	trap "echo 'Caught signal'; toolkit::uninstall ${destination}" EXIT
+	toolkit::install "${destination}"
+
+	toolkit::setup::config "${destination}"
+	toolkit::setup::cli_binary "${destination}"
+	toolkit::setup::toolkit_binary "${destination}"
+	toolkit::setup::runtime_binary "${destination}"
+
+	# The runtime shim still looks for the old binary name.
+	# cd into ${destination} so that the glob below expands and the
+	# symlinks stay relative, so they still resolve to the local target
+	# when the directory is mounted on the host
+	cd "${destination}"
+	ln -s "./nvidia-container-toolkit" \
+		"${destination}/nvidia-container-runtime-hook"
+	ln -s "./libnvidia-container.so.1."* \
+		"${destination}/libnvidia-container.so.1"
+	cd -
+
+	log INFO "Done setting up the NVIDIA Container Toolkit"
+}
diff --git a/runtimeconfig/test/docker_test.sh b/runtimeconfig/test/docker_test.sh
index a295bd8f25de1fa39c8f0b868082cba2040c6ce2..1d6c9380d0c97377ad88b851abf363e73a13c637 100755
--- a/runtimeconfig/test/docker_test.sh
+++ b/runtimeconfig/test/docker_test.sh
@@ -7,6 +7,9 @@ readonly dind_name="nvidia-container-runtime-installer"
 
 # TODO move rm -rf shared to cleanup
 testing::cleanup() {
+	docker run -it --privileged -v "${shared_dir}:/shared" alpine:latest chmod -R 777 /shared
+	rm -rf "${shared_dir}" || true
+
 	docker kill "${dind_name}" || true &> /dev/null
 	docker rm "${dind_name}" || true &> /dev/null
 
@@ -14,10 +17,6 @@ testing::cleanup() {
 }
 
 testing::setup() {
-	local shared_dir=${1:-"./shared"}
-
-	rm -rf "${shared_dir}" || true
-
 	mkdir -p "${shared_dir}"
 	mkdir -p "${shared_dir}"/etc/docker
 	mkdir -p "${shared_dir}"/run/nvidia
@@ -25,17 +24,15 @@ testing::setup() {
 }
 
 testing::main() {
-	local shared_dir="${1:-"./shared"}"
-	local image="${2:-"nvidia/container-toolkit:docker19.03"}"
+	local image="${1:-"nvidia/container-toolkit:docker19.03"}"
 
-	testing::setup "${shared_dir}"
+	testing::setup
 
 	# Docker creates /etc/docker when starting
 	# by default there isn't any config in this directory (even after the daemon starts)
 	docker run --privileged \
 		-v "${shared_dir}/etc/docker:/etc/docker" \
-		-v "${shared_dir}/run/nvidia:/run/nvidia" \
-		-v "${shared_dir}/etc/nvidia-container-runtime:/etc/nvidia-container-runtime" \
+		-v "${shared_dir}/run/nvidia:/run/nvidia:shared" \
 		--name "${dind_name}" -d docker:stable-dind -H unix://run/nvidia/docker.sock
 
 	# Share the volumes so that we can edit the config file and point to the new runtime
 	docker run -it --privileged \
 		--volumes-from "${dind_name}" \
 		--pid "container:${dind_name}" \
 		"${image}" \
-		bash -x -c "/work/run.sh /run/nvidia unix:///run/nvidia/docker.sock"
-
-	docker run -it --privileged \
-		--volumes-from "${dind_name}" \
-		alpine:latest chmod 766 /etc/docker /run/nvidia /etc/nvidia-container-runtime
+		bash -x -c "/work/run.sh /run/nvidia /run/nvidia/docker.sock"
 
 	testing::cleanup
-
-	rm -rf "${shared_dir}" || true
 }
 
+readonly shared_dir="${1:-"./shared"}"
+# shift is allowed to fail when the script is invoked without arguments
+shift || true
+
 trap testing::cleanup ERR
 
 testing::cleanup
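
# Usage sketch (hypothetical invocation; assumes the unchanged tail of this
# script forwards the remaining arguments to testing::main):
#   ./docker_test.sh ./shared nvidia/container-toolkit:docker19.03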