Commit f09ab93a authored by Renaud Gaubert

Refactor runtimeconfig

Signed-off-by: Renaud Gaubert <rgaubert@nvidia.com>
Parent ec41181e
@@ -6,24 +6,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
gnupg2 \
jq \
moreutils \
software-properties-common
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - && \
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add - && \
RUN curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add - && \
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | \
tee /etc/apt/sources.list.d/nvidia-docker.list && \
add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) stable"
tee /etc/apt/sources.list.d/nvidia-docker.list
RUN apt-get update && apt-get install -y --no-install-recommends \
docker-ce-cli \
nvidia-container-runtime
WORKDIR /work
COPY src/. .
RUN cp /etc/nvidia-container-runtime/config.toml ./
RUN chmod +x /work/run.sh
@@ -27,3 +27,30 @@ log() {
printf "%s[%s]%s %b\n" "${fmt_on}" "${level}" "${fmt_off}" "${message}" >&2
}
with_retry() {
local max_attempts="$1"
local delay="$2"
local count=0
local rc
shift 2
while true; do
set +e
"$@"; rc="$?"
set -e
count="$((count+1))"
if [[ "${rc}" -eq 0 ]]; then
return 0
fi
if [[ "${max_attempts}" -le 0 ]] || [[ "${count}" -lt "${max_attempts}" ]]; then
sleep "${delay}"
else
break
fi
done
return 1
}
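# Usage sketch (hedged): docker::setup relies on this helper to ride out the
# Docker-in-Docker startup race, along the lines of
#   nvidia_runtime="$(with_retry 5 2s docker::info "${docker_socket}")"
# A max_attempts of 0 or less makes the loop retry until the command eventually succeeds.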
#! /bin/bash
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
readonly DOCKER_CONFIG="/etc/docker/daemon.json"
docker::info() {
local -r docker_socket="${1:-unix:///var/run/docker.socket}"
local -r docker_socket="${1:-/var/run/docker.sock}"
curl --unix-socket "${docker_socket}" 'http://v1.40/info' | jq -r '.Runtimes.nvidia.path'
}
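# Example (assumes a daemon that already has the nvidia runtime registered):
#   docker::info /run/nvidia/docker.sock
# queries the daemon's /info endpoint over the unix socket and prints the configured
# path of the nvidia runtime, or "null" when no such runtime is set up yet.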
# Docker in Docker has a startup race
for i in $(seq 1 5); do
# Calling in a subshell so that we can recover from a failure
if [[ ! $(docker -H "${docker_socket}" info -f '{{json .Runtimes}}') ]]; then
sleep 2
continue
fi
docker::ensure::mounted() {
if ! mount | grep -q /etc/docker; then
log ERROR "Docker directory isn't mounted in container"
log ERROR "Ensure that you have correctly mounted the docker directory"
exit 1
fi
}
docker::ensure::config_dir() {
# Ensure that the docker config path exists
if [[ ! -d "/etc/docker" ]]; then
log ERROR "Docker directory doesn't exist in container"
log ERROR "Ensure that you have correctly mounted the docker directoy"
exit 1
fi
docker -H "${docker_socket}" info -f '{{json .Runtimes}}' | jq '.nvidia.path'
return
done
exit 1
}
# Echo an empty config if the config file doesn't exist
docker::daemon_config() {
local -r daemon_file="${1:-"/etc/docker/daemon.json"}"
([[ -f "${daemon_file}" ]] && cat "${daemon_file}") || echo {}
docker::config::backup() {
if [[ -f "${DOCKER_CONFIG}" ]]; then
mv "${DOCKER_CONFIG}" "${DOCKER_CONFIG}.bak"
fi
}
docker::refresh_configuration() {
log INFO "Refreshing the docker daemon configuration"
pkill -SIGHUP dockerd
docker::config::restore() {
if [[ -f "${DOCKER_CONFIG}" ]]; then
mv "${DOCKER_CONFIG}.bak" "${DOCKER_CONFIG}"
fi
}
docker::update_config_file() {
docker::config::add_runtime() {
local -r destination="${1:-/run/nvidia}"
local -r nvcr="${destination}/nvidia-container-runtime"
local config_json
IFS='' read -r config_json
echo "${config_json}" | \
cat - | \
jq -r ".runtimes = {}" | \
jq -r ".runtimes += {\"nvidia\": {\"path\": \"${nvcr}\"}}" | \
jq -r '. += {"default-runtime": "nvidia"}'
}
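# Sketch of the expected output for the default destination (note that the first jq
# filter intentionally replaces any pre-existing "runtimes" entries rather than merging):
#   echo '{}' | docker::config::add_runtime /run/nvidia
#   {
#     "runtimes": {
#       "nvidia": {
#         "path": "/run/nvidia/nvidia-container-runtime"
#       }
#     },
#     "default-runtime": "nvidia"
#   }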
docker::ensure_prerequisites() {
# Ensure that the docker config path exists
if [[ ! -d "/etc/docker" ]]; then
log ERROR "Docker directory doesn't exist in container"
log ERROR "Ensure that you have correctly mounted the docker directoy"
exit 1
fi
docker::config() {
([[ -f "${DOCKER_CONFIG}" ]] && cat "${DOCKER_CONFIG}") || echo {}
}
if ! mount | grep -q /etc/docker; then
log ERROR "Docker directory isn't mounted in container"
log ERROR "Ensure that you have correctly mounted the docker directory"
exit 1
fi
docker::config::refresh() {
log INFO "Refreshing the docker daemon configuration"
pkill -SIGHUP dockerd
}
docker::config::get_nvidia_runtime() {
cat - | jq -r '.runtimes | keys[0]'
}
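# Feeding the JSON produced by docker::config::add_runtime through this helper should
# print "nvidia"; docker::setup below uses that round trip as a sanity check that the
# updated configuration still parses.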
docker::setup() {
local -r destination="${1:-/run/nvidia}"
docker::ensure::mounted
docker::ensure::config_dir
log INFO "Setting up the configuration for the docker daemon"
docker::ensure_prerequisites
local -r destination="${1:-/run/nvidia}"
local -r docker_socket="${2:-"/var/run/docker.socket"}"
local config=$(docker::config)
log INFO "current config: ${config}"
log INFO "current config: $(docker::daemon_config)"
local -r nvidia_runtime="$(with_retry 5 2s docker::info "${docker_socket}")"
if [[ "${nvidia_runtime}" = "${destination}/nvidia-container-runtime" ]]; then
return
fi
# Append the nvidia runtime to the docker daemon's configuration
# We use sponge here because the input file is the output file
config=$(docker::daemon_config | docker::update_config_file "${destination}")
echo "${config}" > /etc/docker/daemon.json
local updated_config=$(echo "${config}" | docker::config::add_runtime "${destination}")
local -r config_runtime=$(echo "${updated_config}" | docker::config::get_nvidia_runtime)
# If there was an error while parsing the file catch it here
if [[ "${config_runtime}" != "nvidia" ]]; then
config=$(echo "{}" | docker::config::add_runtime "${destination}")
fi
docker::config::backup
echo "${updated_config}" > /etc/docker/daemon.json
log INFO "after: $(docker::daemon_config | jq .)"
docker::refresh_configuration
log INFO "after: $(docker::config | jq .)"
docker::config::refresh
}
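# Invocation sketch: run.sh calls this with the toolkit directory and the docker socket,
# e.g. in the Docker-in-Docker test setup
#   docker::setup /run/nvidia/toolkit /run/nvidia/docker.sock
# If the updated config fails the sanity check above, the nvidia runtime is re-added to
# an empty config instead; the previous daemon.json is kept as ${DOCKER_CONFIG}.bak by
# docker::config::backup and can be put back with docker::config::restore.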
@@ -4,80 +4,33 @@
set -euxo pipefail
shopt -s lastpipe
source "common.sh"
source "docker.sh"
readonly RUN_DIR="/run/nvidia"
readonly TOOLKIT_DIR="${RUN_DIR}/toolkit"
install_nvidia_container_runtime_toolkit() {
log INFO "Installing the NVIDIA Container Runtime Toolkit"
readonly basedir="$(dirname "$(realpath "$0")")"
local -r destination="${1:-/run/nvidia}"
local -a packages=("/usr/bin/nvidia-container-runtime" \
"/usr/bin/nvidia-container-toolkit" \
"/usr/bin/nvidia-container-cli" \
"/etc/nvidia-container-runtime/config.toml" \
"/usr/lib/x86_64-linux-gnu/libnvidia-container.so.1")
# TODO workaround until we fix the runtime requiring this
# directory and file to exist at that location
cp ./config.toml /etc/nvidia-container-runtime
# Bash arrays start at 0
# zsh arrays start at 1
for ((i=0; i < ${#packages[@]}; i++)); do
packages[$i]=$(readlink -f ${packages[$i]})
done
if [[ ! -d "${destination}" ]]; then
log ERROR "Destination directory doesn't exist in container"
log ERROR "Ensure that you have correctly mounted the destination directoy"
exit 1
fi
cp "${packages[@]}" "${destination}"
# Setup links to the real binaries to ensure that variables and configs
# are pointing to the right path
mv "${destination}/nvidia-container-toolkit" \
"${destination}/nvidia-container-toolkit.real"
mv "${destination}/nvidia-container-runtime" \
"${destination}/nvidia-container-runtime.real"
# Setup aliases so as to ensure that the path is correctly set
cat <<- EOF > ${destination}/nvidia-container-toolkit
#! /bin/sh
LD_LIBRARY_PATH="${destination}" \
PATH="\$PATH:${destination}" \
${destination}/nvidia-container-toolkit.real \
-config "${destination}/config.toml" \
\$@
EOF
cat <<- EOF > ${destination}/nvidia-container-runtime
#! /bin/sh
LD_LIBRARY_PATH="${destination}" \
PATH="\$PATH:${destination}" \
${destination}/nvidia-container-runtime.real \
\$@
EOF
# Make sure that the alias files are executable
chmod +x "${destination}/nvidia-container-toolkit"
chmod +x "${destination}/nvidia-container-runtime"
}
source "${basedir}/common.sh"
source "${basedir}/toolkit.sh"
source "${basedir}/docker.sh"
main() {
local -r destination="${1:-/run/nvidia}"
local -r docker_socket="${2:-/var/run/docker.socket}"
local -r nvidia_runtime="$(docker::info ${docker_socket})"
if [[ "${nvidia_runtime}" = "${destination}/nvidia-container-runtime" ]]; then
exit 0
fi
install_nvidia_container_runtime_toolkit "${destination}"
docker::setup "${destination}"
echo "docker info: $(docker::info ${docker_socket})"
local -r destination="${1:-"${RUN_DIR}"}/toolkit"
local -r docker_socket="${2:-"/var/run/docker.socket"}"
toolkit::setup "${destination}"
docker::setup "${destination}" "${docker_socket}"
echo "docker info: $(docker::info "${docker_socket}")"
echo "Done, now waiting for signal"
sleep infinity &
# shellcheck disable=SC2064
# We want the expansion to happen now rather than at trap time
# Set up a new signal handler and reset the EXIT signal handler
trap "echo 'Caught signal'; toolkit::uninstall && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
trap - EXIT
while true; do wait $! || continue; done
exit 0
}
main "$@"
#! /bin/bash
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
packages=("/usr/bin/nvidia-container-runtime" \
"/usr/bin/nvidia-container-toolkit" \
"/usr/bin/nvidia-container-cli" \
"/etc/nvidia-container-runtime/config.toml" \
"/usr/lib/x86_64-linux-gnu/libnvidia-container.so.1")
toolkit::install() {
local -r destination="${1:-"${TOOLKIT_DIR}"}"
log INFO "${FUNCNAME[0]} $*"
mkdir -p "/nvidia" "${destination}"
mount --rbind "/nvidia" "${destination}"
mkdir -p "${destination}"
mkdir -p "${destination}/.config/nvidia-container-runtime"
# Note: Bash arrays start at 0 (zsh arrays start at 1)
for ((i=0; i < ${#packages[@]}; i++)); do
packages[$i]=$(readlink -f "${packages[$i]}")
done
cp "${packages[@]}" "${destination}"
mv "${destination}/config.toml" "${destination}/.config/nvidia-container-runtime/"
}
toolkit::uninstall() {
local -r destination="${1:-"${TOOLKIT_DIR}"}"
log INFO "${FUNCNAME[0]} $*"
if findmnt -r -o TARGET | grep "${destination}" > /dev/null; then
umount -l -R "${destination}"
fi
}
toolkit::setup::config() {
local -r destination="${1:-"${TOOLKIT_DIR}"}"
local -r config_path="${destination}/.config/nvidia-container-runtime/config.toml"
log INFO "${FUNCNAME[0]} $*"
sed -i 's/^#root/root/;' "${config_path}"
sed -i "s@/run/nvidia/driver@${RUN_DIR}/driver@;" "${config_path}"
sed -i "s;@/sbin/ldconfig.real;@${RUN_DIR}/driver/sbin/ldconfig.real;" "${config_path}"
}
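# Rough before/after of the edits, assuming the stock config.toml shipped with
# nvidia-container-runtime (where the root option starts out commented):
#   #root = "/run/nvidia/driver"       ->  root = "/run/nvidia/driver"
#   ldconfig = "@/sbin/ldconfig.real"  ->  ldconfig = "@/run/nvidia/driver/sbin/ldconfig.real"
# i.e. the container CLI is pointed at the driver installation rooted under ${RUN_DIR}/driver.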
toolkit::setup::cli_binary() {
local -r destination="${1:-"${TOOLKIT_DIR}"}"
log INFO "${FUNCNAME[0]} $*"
# Setup links to the real binaries to ensure that variables and configs
# are pointing to the right path
mv "${destination}/nvidia-container-cli" \
"${destination}/nvidia-container-cli.real"
# Setup aliases so as to ensure that the path is correctly set
cat <<- EOF | tr -s ' \t' > ${destination}/nvidia-container-cli
#! /bin/sh
LD_LIBRARY_PATH="${destination}" \
PATH="\$PATH:${destination}" \
${destination}/nvidia-container-cli.real \
\$@
EOF
# Make sure that the alias files are executable
chmod +x "${destination}/nvidia-container-cli"
}
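# For the default destination the generated wrapper is expected to boil down to a
# one-line script along the lines of:
#   #! /bin/sh
#   LD_LIBRARY_PATH="/run/nvidia/toolkit" PATH="$PATH:/run/nvidia/toolkit" /run/nvidia/toolkit/nvidia-container-cli.real $@
# so that the relocated binary resolves libnvidia-container.so.1 from the toolkit
# directory rather than from the system library path.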
toolkit::setup::toolkit_binary() {
local -r destination="${1:-"${TOOLKIT_DIR}"}"
log INFO "${FUNCNAME[0]} $*"
mv "${destination}/nvidia-container-toolkit" \
"${destination}/nvidia-container-toolkit.real"
cat <<- EOF | tr -s ' \t' > ${destination}/nvidia-container-toolkit
#! /bin/sh
PATH="\$PATH:${destination}" \
${destination}/nvidia-container-toolkit.real \
-config "${destination}/.config/nvidia-container-runtime/config.toml" \
\$@
EOF
chmod +x "${destination}/nvidia-container-toolkit"
}
toolkit::setup::runtime_binary() {
local -r destination="${1:-"${TOOLKIT_DIR}"}"
log INFO "${FUNCNAME[0]} $*"
mv "${destination}/nvidia-container-runtime" \
"${destination}/nvidia-container-runtime.real"
cat <<- EOF | tr -s ' \t' > ${destination}/nvidia-container-runtime
#! /bin/sh
PATH="\$PATH:${destination}" \
XDG_CONFIG_HOME="${destination}/.config" \
${destination}/nvidia-container-runtime.real \
\$@
EOF
chmod +x "${destination}/nvidia-container-runtime"
}
toolkit::setup() {
local -r destination="${1:-"${TOOLKIT_DIR}"}"
log INFO "Installing the NVIDIA Container Toolkit"
# shellcheck disable=SC2064
# We want the expansion to happen now rather than at trap time
trap "echo 'Caught signal'; toolkit::uninstall ${destination}" EXIT
toolkit::install "${destination}"
toolkit::setup::config "${destination}"
toolkit::setup::cli_binary "${destination}"
toolkit::setup::toolkit_binary "${destination}"
toolkit::setup::runtime_binary "${destination}"
# The runtime shim still looks for the old nvidia-container-runtime-hook binary
# cd into ${destination} so that the glob below expands against the copied files
# Keep the symlinks relative so that they still resolve to the
# local target when the directory is mounted on the host
cd "${destination}"
ln -s "./nvidia-container-toolkit" \
"${destination}/nvidia-container-runtime-hook"
ln -s "./libnvidia-container.so.1."* \
"${destination}/libnvidia-container.so.1"
cd -
log INFO "Done setting up the NVIDIA Container Toolkit"
}
@@ -7,6 +7,9 @@ readonly dind_name="nvidia-container-runtime-installer"
# TODO move rm -rf shared to cleanup
testing::cleanup() {
docker run -it --privileged -v "${shared_dir}:/shared" alpine:latest chmod -R 777 /shared
rm -rf "${shared_dir}" || true
docker kill "${dind_name}" &> /dev/null || true
docker rm "${dind_name}" &> /dev/null || true
@@ -14,10 +17,6 @@ testing::cleanup() {
}
testing::setup() {
local shared_dir=${1:-"./shared"}
rm -rf "${shared_dir}" || true
mkdir -p "${shared_dir}"
mkdir -p "${shared_dir}"/etc/docker
mkdir -p "${shared_dir}"/run/nvidia
@@ -25,17 +24,15 @@ testing::setup() {
}
testing::main() {
local shared_dir="${1:-"./shared"}"
local image="${2:-"nvidia/container-toolkit:docker19.03"}"
local image="${1:-"nvidia/container-toolkit:docker19.03"}"
testing::setup "${shared_dir}"
testing::setup
# Docker creates /etc/docker when starting
# by default there isn't any config in this directory (even after the daemon starts)
docker run --privileged \
-v "${shared_dir}/etc/docker:/etc/docker" \
-v "${shared_dir}/run/nvidia:/run/nvidia" \
-v "${shared_dir}/etc/nvidia-container-runtime:/etc/nvidia-container-runtime" \
-v "${shared_dir}/run/nvidia:/run/nvidia:shared" \
--name "${dind_name}" -d docker:stable-dind -H unix://run/nvidia/docker.sock
# Share the volumes so that we can edit the config file and point to the new runtime
@@ -44,16 +41,14 @@
--volumes-from "${dind_name}" \
--pid "container:${dind_name}" \
"${image}" \
bash -x -c "/work/run.sh /run/nvidia unix:///run/nvidia/docker.sock"
docker run -it --privileged \
--volumes-from "${dind_name}" \
alpine:latest chmod 766 /etc/docker /run/nvidia /etc/nvidia-container-runtime
bash -x -c "/work/run.sh /run/nvidia /run/nvidia/docker.sock"
testing::cleanup
rm -rf "${shared_dir}" || true
}
readonly shared_dir="${1:-"./shared"}"
shift
trap testing::cleanup ERR
testing::cleanup