From f9db5629873b117c226b15858f128dd2c1f9fd16 Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Wed, 31 Jan 2018 20:09:08 +0800
Subject: [PATCH] update results

---
 benchmark/cluster/vgg16/Dockerfile         |  16 +-
 benchmark/cluster/vgg16/README.md          |   2 +-
 benchmark/cluster/vgg16/fluid_trainer.yaml |   2 +-
 benchmark/cluster/vgg16/k8s_tools.py       |  94 ----------
 benchmark/cluster/vgg16/paddle_k8s         | 199 ---------------------
 benchmark/cluster/vgg16/reader.py          |  16 --
 benchmark/cluster/vgg16/v2_trainer.yaml    |   2 +-
 benchmark/cluster/vgg16/vgg16_v2.py        |   2 +-
 8 files changed, 11 insertions(+), 322 deletions(-)
 delete mode 100644 benchmark/cluster/vgg16/k8s_tools.py
 delete mode 100755 benchmark/cluster/vgg16/paddle_k8s
 delete mode 100644 benchmark/cluster/vgg16/reader.py

diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
index dfaffb8c213..c34f7e8fcff 100644
--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -1,15 +1,13 @@
-#FROM paddlepaddle/paddlecloud-job
-#RUN mkdir -p /workspace
-#ADD reader.py /workspace/
-#RUN python /workspace/reader.py
 FROM python:2.7.14
-ADD paddle_k8s /usr/bin
-ADD k8s_tools.py /root
-RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev && \
+chmod +x /usr/bin/paddle_k8s
+# NOTE: By default, CI-built wheel packages are compiled with WITH_DISTRIBUTE=OFF,
+# so we must build one with distribute support to install in this image.
 ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD reader.py /workspace/
-RUN python /workspace/reader.py
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
 ADD vgg16_fluid.py vgg16_v2.py /workspace/
 
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index c1e85a2c407..0c404e60a87 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -43,7 +43,7 @@
 | Trainer Counter | 20 | 40 | 80 | 100 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 |
-| PaddlePaddle v2 | 356.28 | - | - | 1041.99 |
+| PaddlePaddle v2 (needs more tests) | 356.28 | 785.39 | 853.30 | 1041.99 |
 | TensorFlow | - | - | - | - |
 
 ### different pserver number
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
index 2f6a87ab02a..0a0ed25ebe4 100644
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -30,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 256"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
diff --git a/benchmark/cluster/vgg16/k8s_tools.py b/benchmark/cluster/vgg16/k8s_tools.py
deleted file mode 100644
index 4bee96a7a80..00000000000
--- a/benchmark/cluster/vgg16/k8s_tools.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/bin/env python
-import os
-import sys
-import time
-import socket
-from kubernetes import client, config
-PADDLE_JOB_NAME = os.getenv("PADDLE_JOB_NAME")
-NAMESPACE = os.getenv("NAMESPACE")
-PORT = os.getenv("PSERVER_PORT")
-if os.getenv("KUBERNETES_SERVICE_HOST", None):
-    config.load_incluster_config()
-else:
-    config.load_kube_config()
-v1 = client.CoreV1Api()
-
-
-def fetch_pods_info(label_selector):
-    api_response = v1.list_namespaced_pod(
-        namespace=NAMESPACE, pretty=True, label_selector=label_selector)
-    pod_list = []
-    for item in api_response.items:
-        pod_list.append((item.status.phase, item.status.pod_ip))
-    return pod_list
-
-
-def wait_pods_running(label_selector, desired):
-    print "label selector: %s, desired: %s" % (label_selector, desired)
-    while True:
-        count = count_pods_by_phase(label_selector, 'Running')
-        # NOTE: pods may be scaled.
-        if count >= int(desired):
-            break
-        print 'current cnt: %d sleep for 5 seconds...' % count
-        time.sleep(5)
-
-
-def count_pods_by_phase(label_selector, phase):
-    pod_list = fetch_pods_info(label_selector)
-    filtered_pod_list = filter(lambda x: x[0] == phase, pod_list)
-    return len(filtered_pod_list)
-
-
-def fetch_pserver_ips():
-    label_selector = "paddle-job-pserver=%s" % PADDLE_JOB_NAME
-    pod_list = fetch_pods_info(label_selector)
-    pserver_ips = [item[1] for item in pod_list]
-    return ",".join(pserver_ips)
-
-
-def fetch_master_ip():
-    label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME
-    pod_list = fetch_pods_info(label_selector)
-    master_ips = [item[1] for item in pod_list]
-    return master_ips[0]
-
-
-def fetch_trainer_id():
-    label_selector = "paddle-job=%s" % PADDLE_JOB_NAME
-    pod_list = fetch_pods_info(label_selector)
-    trainer_ips = [item[1] for item in pod_list]
-    trainer_ips.sort()
-    local_ip = socket.gethostbyname(socket.gethostname())
-    for i in xrange(len(trainer_ips)):
-        if trainer_ips[i] == local_ip:
-            return i
-    return None
-
-
-if __name__ == "__main__":
-    command = sys.argv[1]
-    if command == "fetch_pserver_ips":
-        print fetch_pserver_ips()
-    elif command == "fetch_trainer_id":
-        print fetch_trainer_id()
-    elif command == "fetch_master_ip":
-        print fetch_master_ip()
-    elif command == "count_pods_by_phase":
-        print count_pods_by_phase(sys.argv[2], sys.argv[3])
-    elif command == "wait_pods_running":
-        wait_pods_running(sys.argv[2], sys.argv[3])
diff --git a/benchmark/cluster/vgg16/paddle_k8s b/benchmark/cluster/vgg16/paddle_k8s
deleted file mode 100755
index af5f35b3eca..00000000000
--- a/benchmark/cluster/vgg16/paddle_k8s
+++ /dev/null
@@ -1,199 +0,0 @@
-#!/bin/bash
-start_pserver() {
-  stdbuf -oL paddle pserver \
-    --use_gpu=0 \
-    --port=$PADDLE_INIT_PORT \
-    --ports_num=$PADDLE_INIT_PORTS_NUM \
-    --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
-    --nics=$PADDLE_INIT_NICS \
-    --comment=paddle_process_k8s \
-    --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
-}
-
-start_new_pserver() {
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1
-  export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
-  stdbuf -oL /usr/bin/pserver \
-    -port=$PADDLE_INIT_PORT \
-    -num-pservers=$PSERVERS \
-    -log-level=debug \
-    -etcd-endpoint=http://$MASTER_IP:2379
-}
-
-start_master() {
-  stdbuf -oL /usr/bin/master \
-    -port=8080 \
-    -chunk-per-task=1\
-    -task-timout-dur=16s\
-    -endpoints=http://127.0.0.1:2379
-}
-
-check_failed_cnt() {
-  max_failed=$1
-  failed_count=$(python /root/k8s_tools.py count_pods_by_phase paddle-job=${PADDLE_JOB_NAME} Failed)
-  if [ $failed_count -gt $max_failed ]; then
-    stdbuf -oL echo "Failed trainer count beyond the threadhold: "$max_failed
-    echo "Failed trainer count beyond the threshold: " $max_failed > /dev/termination-log
-    exit 0
-  fi
-}
-
-check_trainer_ret() {
-  ret=$1
-  stdbuf -oL echo "job returned $ret...setting pod return message..."
-  stdbuf -oL echo "==============================="
-
-  if [ $ret -eq 136 ] ; then
-    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
-  elif [ $ret -eq 139 ] ; then
-    echo "Segmentation Fault" > /dev/termination-log
-  elif [ $ret -eq 1 ] ; then
-    echo "General Error" > /dev/termination-log
-  elif [ $ret -eq 134 ] ; then
-    echo "Program Abort" > /dev/termination-log
-  fi
-  stdbuf -oL echo "termination log wroted..."
-  exit $ret
-}
-
-start_fluid_process() {
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
-  if [ "${TRAINING_ROLE}" == "TRAINER" ]; then
-    check_failed_cnt ${TRAINERS}
-    sleep 5
-    export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
-  fi
-  export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
-  stdbuf -oL sh -c "${ENTRY}"
-  check_trainer_ret $?
-}
-
-start_new_trainer() {
-  # FIXME(Yancey1989): use command-line interface to configure the max failed count
-  check_failed_cnt ${TRAINERS}
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
-  sleep 5
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1
-  export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
-  export ETCD_IP="$MASTER_IP"
-
-  # NOTE: $TRAINER_PACKAGE may be large, do not copy
-  export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
-  cd $TRAINER_PACKAGE
-
-  stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
-    $PADDLE_INIT_NUM_GRADIENT_SERVERS, "version: " $1
-
-  stdbuf -oL sh -c "${ENTRY}"
-  check_trainer_ret $?
-}
-
-start_trainer() {
-  # paddle v1 and V2 distributed training does not allow any trainer failed.
-  check_failed_cnt 0
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job=${PADDLE_JOB_NAME} ${TRAINERS}
-
-  export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
-  export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
-  stdbuf -oL echo $PADDLE_INIT_TRAINER_ID > /trainer_id
-  # FIXME: /trainer_count = PADDLE_INIT_NUM_GRADIENT_SERVERS
-  stdbuf -oL echo $PADDLE_INIT_NUM_GRADIENT_SERVERS > /trainer_count
-
-  # NOTE: $TRAINER_PACKAGE may be large, do not copy
-  export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
-  cd $TRAINER_PACKAGE
-
-  stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
-    $PADDLE_INIT_NUM_GRADIENT_SERVERS, "trainer_id: " $PADDLE_INIT_TRAINER_ID, \
-    "version: " $1
-
-  # FIXME: If we use the new PServer by Golang, add Kubernetes healthz
-  # to wait PServer process get ready.Now only sleep 20 seconds.
-  sleep 20
-
-  case "$1" in
-    "v1")
-      FILE_COUNT=$(wc -l $TRAIN_LIST | awk '{print $1}')
-      if [ $FILE_COUNT -le $PADDLE_INIT_NUM_GRADIENT_SERVERS ]; then
-        echo "file count less than trainers"
-        check_trainer_ret 0
-      fi
-      let lines_per_node="$FILE_COUNT / ($PADDLE_INIT_NUM_GRADIENT_SERVERS + 1)"
-      echo "spliting file to" $lines_per_node
-      cp $TRAIN_LIST /
-      cd /
-      split -l $lines_per_node -d -a 3 $TRAIN_LIST train.list
-      CURRENT_LIST=$(printf "train.list%03d" $PADDLE_INIT_TRAINER_ID)
-      # always use /train.list for paddle v1 for each node.
-      echo "File for current node ${CURRENT_LIST}"
-      sleep 10
-      cp $CURRENT_LIST train.list
-
-      cd $TRAINER_PACKAGE
-
-      stdbuf -oL paddle train \
-        --port=$PADDLE_INIT_PORT \
-        --nics=$PADDLE_INIT_NICS \
-        --ports_num=$PADDLE_INIT_PORTS_NUM \
-        --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
-        --num_passes=$PADDLE_INIT_NUM_PASSES \
-        --trainer_count=$PADDLE_INIT_TRAINER_COUNT \
-        --saving_period=1 \
-        --log_period=20 \
-        --local=0 \
-        --rdma_tcp=tcp \
-        --config=$TOPOLOGY \
-        --use_gpu=$PADDLE_INIT_USE_GPU \
-        --trainer_id=$PADDLE_INIT_TRAINER_ID \
-        --save_dir=$OUTPUT \
-        --pservers=$PADDLE_INIT_PSERVERS \
-        --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
-      # paddle v1 API does not allow any trainer failed.
-      check_trainer_ret $?
-      ;;
-    "v2")
-      stdbuf -oL sh -c "${ENTRY}"
-      # paddle v2 API does not allow any trainer failed.
-      check_trainer_ret $?
-      ;;
-    *)
-      ;;
-  esac
-}
-
-usage() {
-  echo "usage: paddle_k8s []:"
-  echo "  start_trainer [v1|v2]  Start a trainer process with v1 or v2 API"
-  echo "  start_pserver          Start a pserver process"
-  echo "  start_new_pserver      Start a new pserver process"
-  echo "  start_new_trainer      Start a new triner process"
-}
-
-case "$1" in
-  start_pserver)
-    start_pserver
-    ;;
-  start_trainer)
-    start_trainer $2
-    ;;
-  start_new_trainer)
-    start_new_trainer
-    ;;
-  start_new_pserver)
-    start_new_pserver
-    ;;
-  start_master)
-    start_master
-    ;;
-  start_fluid)
-    start_fluid_process
-    ;;
-  --help)
-    usage
-    ;;
-  *)
-    usage
-    ;;
-esac
-
diff --git a/benchmark/cluster/vgg16/reader.py b/benchmark/cluster/vgg16/reader.py
deleted file mode 100644
index 3e20f830fce..00000000000
--- a/benchmark/cluster/vgg16/reader.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2 as paddle
-paddle.dataset.cifar.train10()
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
index 997bbc81c99..12c8964066c 100644
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -38,7 +38,7 @@ spec:
         - name: PADDLE_INIT_NICS
           value: "xgbe0"
         - name: PADDLE_INIT_TRAINER_COUNT
-          value: "2"
+          value: "1"
         - name: PADDLE_INIT_PORTS_NUM
           value: "1"
         - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
index 81ddeb03323..6ac6b3c3325 100644
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@@ -51,7 +51,7 @@ def vgg(input, nums, class_dim):
     conv4 = conv_block(conv3, 512, nums[3])
     conv5 = conv_block(conv4, 512, nums[4])
 
-    fc_dim = 4096
+    fc_dim = 512
     fc1 = paddle.layer.fc(input=conv5,
                           size=fc_dim,
                           act=paddle.activation.Relu(),
--
GitLab