提交 83c90e35 编写于 作者: G gunan 提交者: GitHub

Merge pull request #4646 from caisq/dist-fix-1

Make whl file URL in dist_test Dockerfiles configurable
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Docker image for testing distributed (GRPC) TensorFlow on Google Container
# Engine (GKE).
#
# See ./remote_test.sh for usage example.

FROM ubuntu:16.04

LABEL maintainer="Shanqing Cai <cais@google.com>"

# Refresh the package index and install in one layer so that a cached
# `apt-get update` layer is never combined with a changed package list.
# The index files are removed in the same layer to keep them out of the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        python \
        python-numpy \
        python-pip \
        && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Install Google Cloud SDK
# Download the installer script; `curl -O` saves it under its remote file name
# (install_google_cloud_sdk.bash) in the current working directory.
RUN curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/install_google_cloud_sdk.bash
......@@ -17,10 +39,11 @@ RUN ./install_google_cloud_sdk.bash --disable-prompts --install-dir=/var/gcloud
# Install kubectl
# Added as a gcloud component, using the gcloud binary under
# /var/gcloud/google-cloud-sdk.
RUN /var/gcloud/google-cloud-sdk/bin/gcloud components install kubectl
# Install TensorFlow pip whl
# The whl file is taken from the docker build context instead of being
# downloaded from a fixed URL, so the caller chooses which build is tested.
# TODO(cais): Should we build it locally instead?
COPY tensorflow-*.whl /
# Install and delete the whl in a single step; --no-cache-dir keeps pip's
# download cache out of the layer.
RUN pip install --no-cache-dir /tensorflow-*.whl && \
    rm -f /tensorflow-*.whl

# Copy test files
COPY scripts /var/tf-dist-test/scripts
......
......@@ -23,19 +23,16 @@ MAINTAINER Shanqing Cai <cais@google.com>
# Pick up some TF dependencies.
RUN apt-get update && apt-get install -y \
        curl \
        python-numpy \
        python-pip \
        && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Bootstrap pip from the upstream installer script and remove the script in
# the same layer.
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

# Install TensorFlow pip whl
# The whl file is taken from the docker build context instead of being
# downloaded from a fixed URL, so the caller chooses which build is tested.
# TODO(cais): Should we build it locally instead?
COPY tensorflow-*.whl /
# Install and delete the whl in a single step; --no-cache-dir keeps pip's
# download cache out of the layer.
RUN pip install --no-cache-dir /tensorflow-*.whl && \
    rm -f /tensorflow-*.whl

# Copy the whole build context (test scripts included) into the image.
COPY . /var/tf_dist_test
......@@ -24,16 +24,20 @@
# 3) Call a script to launch a k8s TensorFlow GRPC cluster inside the container
# and run the distributed test suite.
#
# Usage: local_test.sh <whl_url>
# [--leave_container_running]
# [--model_name <MODEL_NAME>]
# [--num_workers <NUM_WORKERS>]
# [--num_parameter_servers <NUM_PARAMETER_SERVERS>]
# [--sync_replicas]
#
# E.g., local_test.sh <whl_url> --model_name CENSUS_WIDENDEEP
# local_test.sh <whl_url> --num_workers 3 --num_parameter_servers 3
#
# Arguments:
# <whl_url>
# Specify custom TensorFlow whl file URL to install in the test Docker image.
#
# --leave_container_running: Do not stop the docker-in-docker container after
# the termination of the tests, e.g., for debugging
#
......@@ -48,6 +52,7 @@
# (workers) will be aggregated before applied, which avoids stale parameter
# updates.
#
#
# In addition, this script obeys the following environment variables:
# TF_DIST_DOCKER_NO_CACHE: do not use cache when building docker images
......@@ -72,6 +77,11 @@ NUM_WORKERS=2
NUM_PARAMETER_SERVERS=2
SYNC_REPLICAS_FLAG=""
# The first positional argument is taken as the TensorFlow whl file URL;
# the script aborts when it is missing.
# NOTE(review): `die` is not defined in the visible part of this script before
# this point — confirm it is available here.
WHL_URL=${1}
if [[ -z "${WHL_URL}" ]]; then
die "whl file URL is not specified"
fi
while true; do
if [[ $1 == "--leave_container_running" ]]; then
LEAVE_CONTAINER_RUNNING=1
......@@ -84,6 +94,8 @@ while true; do
NUM_PARAMETER_SERVERS=$2
elif [[ $1 == "--sync_replicas" ]]; then
SYNC_REPLICAS_FLAG="--sync_replicas"
elif [[ $1 == "--whl_url" ]]; then
WHL_URL=$2
fi
shift
......@@ -104,25 +116,35 @@ DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Get utility functions
source ${DIR}/scripts/utils.sh
# Refuse to continue while a docker-in-docker container created from the same
# image is still running.
if [[ -n "$(get_container_id_by_image_name ${DOCKER_IMG_NAME})" ]]; then
  die "It appears that there is already at least one Docker container "\
"of image name ${DOCKER_IMG_NAME} running. Please stop it before trying again"
fi
# Build docker-in-docker image for local k8s cluster.
# Caching is disabled when TF_DIST_DOCKER_NO_CACHE is set to anything other
# than "0".
NO_CACHE_FLAG=""
if [[ ! -z "${TF_DIST_DOCKER_NO_CACHE}" ]] &&
   [[ "${TF_DIST_DOCKER_NO_CACHE}" != "0" ]]; then
  NO_CACHE_FLAG="--no-cache"
fi

# Create docker build context directory.
BUILD_DIR=$(mktemp -d) || die "Failed to create temporary build directory"
echo ""
echo "Using whl file URL: ${WHL_URL}"
echo "Building in temporary directory: ${BUILD_DIR}"

# Every failure path below removes the temporary directory before aborting,
# so a failed run does not leave a stale build context behind.
cp -r "${DIR}"/* "${BUILD_DIR}"/ || {
  rm -rf "${BUILD_DIR}"
  die "Failed to copy files to ${BUILD_DIR}"
}

# Download whl file into the build context directory.
wget -P "${BUILD_DIR}" "${WHL_URL}" || {
  rm -rf "${BUILD_DIR}"
  die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
}

# Build docker image for test.
# NO_CACHE_FLAG is intentionally unquoted: when empty it must expand to no
# argument at all.
docker build ${NO_CACHE_FLAG} -t "${DOCKER_IMG_NAME}" \
    -f "${BUILD_DIR}/Dockerfile.local" "${BUILD_DIR}" || {
  rm -rf "${BUILD_DIR}"
  die "Failed to build docker image: ${DOCKER_IMG_NAME}"
}

# Clean up docker build context directory.
rm -rf "${BUILD_DIR}"
# Run docker image for test.
docker run ${DOCKER_IMG_NAME} \
/var/tf_dist_test/scripts/dist_mnist_test.sh \
--ps_hosts "localhost:2000,localhost:2001" \
......
......@@ -20,13 +20,17 @@
# runs from within a container based on the image.
#
# Usage:
# remote_test.sh <whl_url>
# [--setup_cluster_only]
# [--num_workers <NUM_WORKERS>]
# [--num_parameter_servers <NUM_PARAMETER_SERVERS>]
# [--sync_replicas]
#
# Arguments:
# <whl_url>
# URL of the TensorFlow whl file to install in the test Docker image.
#
# --setup_cluster_only:
# Set up the TensorFlow k8s cluster only, and do not perform testing of
# the distributed runtime.
#
......@@ -42,6 +46,7 @@
# updates.
#
#
#
# If any of the following environment variable has non-empty values, it will
# be mapped into the docker container to override the default values (see
# dist_test.sh)
......@@ -95,8 +100,34 @@ if [[ ! -z "${TF_DIST_DOCKER_NO_CACHE}" ]] &&
NO_CACHE_FLAG="--no-cache"
fi
# Parse command-line arguments.
# The first positional argument is the URL of the TensorFlow whl file; the
# script aborts when it is missing.
WHL_URL=${1}
if [[ -z "${WHL_URL}" ]]; then
  die "whl URL is not specified"
fi

# Create docker build context directory.
BUILD_DIR=$(mktemp -d) || die "Failed to create temporary build directory"
echo ""
echo "Using custom whl file URL: ${WHL_URL}"
echo "Building in temporary directory: ${BUILD_DIR}"

# Every failure path below removes the temporary directory before aborting,
# so a failed run does not leave a stale build context behind.
cp -r "${DIR}"/* "${BUILD_DIR}"/ || {
  rm -rf "${BUILD_DIR}"
  die "Failed to copy files to ${BUILD_DIR}"
}

# Download whl file into the build context directory.
wget -P "${BUILD_DIR}" "${WHL_URL}" || {
  rm -rf "${BUILD_DIR}"
  die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
}

# Build docker image for test.
# NO_CACHE_FLAG is intentionally unquoted: when empty it must expand to no
# argument at all.
docker build ${NO_CACHE_FLAG} \
    -t "${DOCKER_IMG_NAME}" -f "${BUILD_DIR}/Dockerfile" "${BUILD_DIR}" || {
  rm -rf "${BUILD_DIR}"
  die "Failed to build docker image: ${DOCKER_IMG_NAME}"
}

# Clean up docker build context directory.
rm -rf "${BUILD_DIR}"
# Run docker image for test.
KEY_FILE=${TF_DIST_GCLOUD_KEY_FILE:-"${HOME}/gcloud-secrets/tensorflow-testing.json"}
docker run --rm -v ${KEY_FILE}:/var/gcloud/secrets/tensorflow-testing.json \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册