Commit f9db5629
Authored on Jan 31, 2018 by typhoonzero

update results

Parent: bd64719a

Showing 8 changed files with 11 additions and 322 deletions (+11 -322)
benchmark/cluster/vgg16/Dockerfile          +7   -9
benchmark/cluster/vgg16/README.md           +1   -1
benchmark/cluster/vgg16/fluid_trainer.yaml  +1   -1
benchmark/cluster/vgg16/k8s_tools.py        +0   -94
benchmark/cluster/vgg16/paddle_k8s          +0   -199
benchmark/cluster/vgg16/reader.py           +0   -16
benchmark/cluster/vgg16/v2_trainer.yaml     +1   -1
benchmark/cluster/vgg16/vgg16_v2.py         +1   -1
benchmark/cluster/vgg16/Dockerfile (view file @ f9db5629)

-#FROM paddlepaddle/paddlecloud-job
-#RUN mkdir -p /workspace
-#ADD reader.py /workspace/
-#RUN python /workspace/reader.py
 FROM python:2.7.14
-ADD paddle_k8s /usr/bin
-ADD k8s_tools.py /root
-RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev && \
+    chmod +x /usr/bin/paddle_k8s
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+# so we must build one with distribute support to install in this image.
 ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD reader.py /workspace/
-RUN python /workspace/reader.py
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
 ADD vgg16_fluid.py vgg16_v2.py /workspace/
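Because the image installs a locally built wheel (ADD *.whl /), a wheel compiled with WITH_DISTRIBUTE=ON has to sit next to the Dockerfile before building. A minimal sketch of the build flow, where the wheel output path and the image tag are assumptions, not part of this commit:

# copy a distribute-enabled PaddlePaddle wheel next to the Dockerfile
# (the build-output path below is an assumption)
cp /paddle/build/python/dist/paddlepaddle-*.whl .
# build and push the benchmark image; the registry/tag is a placeholder
docker build -t <your-registry>/vgg16-dist-benchmark:latest .
docker push <your-registry>/vgg16-dist-benchmark:latest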
benchmark/cluster/vgg16/README.md (view file @ f9db5629)

@@ -43,7 +43,7 @@
 | Trainer Counter | 20 | 40 | 80 | 100 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 |
-| PaddlePaddle v2 | 356.28 | - | - | 1041.99 |
+| PaddlePaddle v2 (need more tests) | 356.28 | 785.39 | 853.30 | 1041.99 |
 | TensorFlow | - | - | - | - |

 ### different pserver number
benchmark/cluster/vgg16/fluid_trainer.yaml (view file @ f9db5629)

@@ -30,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 256"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
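The ENTRY string is what paddle_k8s ultimately executes via sh -c "${ENTRY}" inside each trainer pod. A minimal sketch of submitting the Fluid job, assuming a configured kubectl context; the pserver manifest file name and the job label value are assumptions:

kubectl create -f fluid_pserver.yaml      # pserver manifest name is an assumption
kubectl create -f fluid_trainer.yaml
kubectl get pods -l paddle-job=vgg16job   # label value follows the job name set in the manifests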
benchmark/cluster/vgg16/k8s_tools.py (deleted, mode 100644 → 0, view file @ bd64719a)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/env python
import os
import sys
import time
import socket
from kubernetes import client, config

PADDLE_JOB_NAME = os.getenv("PADDLE_JOB_NAME")
NAMESPACE = os.getenv("NAMESPACE")
PORT = os.getenv("PSERVER_PORT")
if os.getenv("KUBERNETES_SERVICE_HOST", None):
    config.load_incluster_config()
else:
    config.load_kube_config()
v1 = client.CoreV1Api()


def fetch_pods_info(label_selector):
    api_response = v1.list_namespaced_pod(
        namespace=NAMESPACE, pretty=True, label_selector=label_selector)
    pod_list = []
    for item in api_response.items:
        pod_list.append((item.status.phase, item.status.pod_ip))
    return pod_list


def wait_pods_running(label_selector, desired):
    print "label selector: %s, desired: %s" % (label_selector, desired)
    while True:
        count = count_pods_by_phase(label_selector, 'Running')
        # NOTE: pods may be scaled.
        if count >= int(desired):
            break
        print 'current cnt: %d sleep for 5 seconds...' % count
        time.sleep(5)


def count_pods_by_phase(label_selector, phase):
    pod_list = fetch_pods_info(label_selector)
    filtered_pod_list = filter(lambda x: x[0] == phase, pod_list)
    return len(filtered_pod_list)


def fetch_pserver_ips():
    label_selector = "paddle-job-pserver=%s" % PADDLE_JOB_NAME
    pod_list = fetch_pods_info(label_selector)
    pserver_ips = [item[1] for item in pod_list]
    return ",".join(pserver_ips)


def fetch_master_ip():
    label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME
    pod_list = fetch_pods_info(label_selector)
    master_ips = [item[1] for item in pod_list]
    return master_ips[0]


def fetch_trainer_id():
    label_selector = "paddle-job=%s" % PADDLE_JOB_NAME
    pod_list = fetch_pods_info(label_selector)
    trainer_ips = [item[1] for item in pod_list]
    trainer_ips.sort()
    local_ip = socket.gethostbyname(socket.gethostname())
    for i in xrange(len(trainer_ips)):
        if trainer_ips[i] == local_ip:
            return i
    return None


if __name__ == "__main__":
    command = sys.argv[1]
    if command == "fetch_pserver_ips":
        print fetch_pserver_ips()
    elif command == "fetch_trainer_id":
        print fetch_trainer_id()
    elif command == "fetch_master_ip":
        print fetch_master_ip()
    elif command == "count_pods_by_phase":
        print count_pods_by_phase(sys.argv[2], sys.argv[3])
    elif command == "wait_pods_running":
        wait_pods_running(sys.argv[2], sys.argv[3])
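The module doubles as a small one-shot CLI; the paddle_k8s startup script below invokes it exactly this way, with the environment variables injected by the job spec:

# wait until all pservers of this job report phase Running
python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
# export the comma-separated pserver IP list for the trainers
export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)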
benchmark/cluster/vgg16/paddle_k8s (deleted, mode 100755 → 0, view file @ bd64719a)

#!/bin/bash
start_pserver() {
  stdbuf -oL paddle pserver \
    --use_gpu=0 \
    --port=$PADDLE_INIT_PORT \
    --ports_num=$PADDLE_INIT_PORTS_NUM \
    --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
    --nics=$PADDLE_INIT_NICS \
    --comment=paddle_process_k8s \
    --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
}

start_new_pserver() {
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1
  export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
  stdbuf -oL /usr/bin/pserver \
    -port=$PADDLE_INIT_PORT \
    -num-pservers=$PSERVERS \
    -log-level=debug \
    -etcd-endpoint=http://$MASTER_IP:2379
}

start_master() {
  stdbuf -oL /usr/bin/master \
    -port=8080 \
    -chunk-per-task=1 \
    -task-timout-dur=16s \
    -endpoints=http://127.0.0.1:2379
}

check_failed_cnt() {
  max_failed=$1
  failed_count=$(python /root/k8s_tools.py count_pods_by_phase paddle-job=${PADDLE_JOB_NAME} Failed)
  if [ $failed_count -gt $max_failed ]; then
    stdbuf -oL echo "Failed trainer count beyond the threshold: " $max_failed
    echo "Failed trainer count beyond the threshold: " $max_failed > /dev/termination-log
    exit 0
  fi
}

check_trainer_ret() {
  ret=$1
  stdbuf -oL echo "job returned $ret...setting pod return message..."
  stdbuf -oL echo "==============================="
  if [ $ret -eq 136 ]; then
    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
  elif [ $ret -eq 139 ]; then
    echo "Segmentation Fault" > /dev/termination-log
  elif [ $ret -eq 1 ]; then
    echo "General Error" > /dev/termination-log
  elif [ $ret -eq 134 ]; then
    echo "Program Abort" > /dev/termination-log
  fi
  stdbuf -oL echo "termination log written..."
  exit $ret
}

start_fluid_process() {
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
  if [ "${TRAINING_ROLE}" == "TRAINER" ]; then
    check_failed_cnt ${TRAINERS}
    sleep 5
    export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
  fi
  export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
  stdbuf -oL sh -c "${ENTRY}"
  check_trainer_ret $?
}

start_new_trainer() {
  # FIXME(Yancey1989): use command-line interface to configure the max failed count
  check_failed_cnt ${TRAINERS}
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
  sleep 5
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1
  export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
  export ETCD_IP="$MASTER_IP"

  # NOTE: $TRAINER_PACKAGE may be large, do not copy
  export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
  cd $TRAINER_PACKAGE

  stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
    $PADDLE_INIT_NUM_GRADIENT_SERVERS, "version: " $1
  stdbuf -oL sh -c "${ENTRY}"
  check_trainer_ret $?
}

start_trainer() {
  # paddle v1 and v2 distributed training do not allow any failed trainer.
  check_failed_cnt 0
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job=${PADDLE_JOB_NAME} ${TRAINERS}

  export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
  export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
  stdbuf -oL echo $PADDLE_INIT_TRAINER_ID > /trainer_id
  # FIXME: /trainer_count = PADDLE_INIT_NUM_GRADIENT_SERVERS
  stdbuf -oL echo $PADDLE_INIT_NUM_GRADIENT_SERVERS > /trainer_count

  # NOTE: $TRAINER_PACKAGE may be large, do not copy
  export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
  cd $TRAINER_PACKAGE

  stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
    $PADDLE_INIT_NUM_GRADIENT_SERVERS, "trainer_id: " $PADDLE_INIT_TRAINER_ID, \
    "version: " $1

  # FIXME: If we use the new PServer by Golang, add a Kubernetes healthz probe
  # to wait for the PServer process to get ready. For now we only sleep 20 seconds.
  sleep 20

  case "$1" in
    "v1")
      FILE_COUNT=$(wc -l $TRAIN_LIST | awk '{print $1}')
      if [ $FILE_COUNT -le $PADDLE_INIT_NUM_GRADIENT_SERVERS ]; then
        echo "file count less than trainers"
        check_trainer_ret 0
      fi
      let lines_per_node="$FILE_COUNT / ($PADDLE_INIT_NUM_GRADIENT_SERVERS + 1)"
      echo "splitting file to" $lines_per_node
      cp $TRAIN_LIST /
      cd /
      split -l $lines_per_node -d -a 3 $TRAIN_LIST train.list
      CURRENT_LIST=$(printf "train.list%03d" $PADDLE_INIT_TRAINER_ID)
      # always use /train.list for paddle v1 for each node.
      echo "File for current node ${CURRENT_LIST}"
      sleep 10
      cp $CURRENT_LIST train.list

      cd $TRAINER_PACKAGE

      stdbuf -oL paddle train \
        --port=$PADDLE_INIT_PORT \
        --nics=$PADDLE_INIT_NICS \
        --ports_num=$PADDLE_INIT_PORTS_NUM \
        --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
        --num_passes=$PADDLE_INIT_NUM_PASSES \
        --trainer_count=$PADDLE_INIT_TRAINER_COUNT \
        --saving_period=1 \
        --log_period=20 \
        --local=0 \
        --rdma_tcp=tcp \
        --config=$TOPOLOGY \
        --use_gpu=$PADDLE_INIT_USE_GPU \
        --trainer_id=$PADDLE_INIT_TRAINER_ID \
        --save_dir=$OUTPUT \
        --pservers=$PADDLE_INIT_PSERVERS \
        --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
      # paddle v1 API does not allow any failed trainer.
      check_trainer_ret $?
      ;;
    "v2")
      stdbuf -oL sh -c "${ENTRY}"
      # paddle v2 API does not allow any failed trainer.
      check_trainer_ret $?
      ;;
    *)
      ;;
  esac
}

usage() {
  echo "usage: paddle_k8s [<args>]:"
  echo "  start_trainer [v1|v2]    Start a trainer process with v1 or v2 API"
  echo "  start_pserver            Start a pserver process"
  echo "  start_new_pserver        Start a new pserver process"
  echo "  start_new_trainer        Start a new trainer process"
}

case "$1" in
  start_pserver)
    start_pserver
    ;;
  start_trainer)
    start_trainer $2
    ;;
  start_new_trainer)
    start_new_trainer
    ;;
  start_new_pserver)
    start_new_pserver
    ;;
  start_master)
    start_master
    ;;
  start_fluid)
    start_fluid_process
    ;;
  --help)
    usage
    ;;
  *)
    usage
    ;;
esac
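The job manifests are expected to use this script as the container entry point; the sub-commands mirror the case statement above:

paddle_k8s start_pserver        # launch a v1/v2 parameter server
paddle_k8s start_trainer v2     # launch a trainer with the v2 API (v1 also supported)
paddle_k8s start_fluid          # launch a Fluid process, which runs ${ENTRY}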
benchmark/cluster/vgg16/reader.py (deleted, mode 100644 → 0, view file @ bd64719a)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2 as paddle

# Calling train10() downloads and caches the CIFAR-10 training set,
# which is why the Dockerfile ran this file at image build time.
paddle.dataset.cifar.train10()
benchmark/cluster/vgg16/v2_trainer.yaml (view file @ f9db5629)

@@ -38,7 +38,7 @@ spec:
         - name: PADDLE_INIT_NICS
           value: "xgbe0"
         - name: PADDLE_INIT_TRAINER_COUNT
-          value: "2"
+          value: "1"
         - name: PADDLE_INIT_PORTS_NUM
           value: "1"
         - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
benchmark/cluster/vgg16/vgg16_v2.py (view file @ f9db5629)

@@ -51,7 +51,7 @@ def vgg(input, nums, class_dim):
     conv4 = conv_block(conv3, 512, nums[3])
     conv5 = conv_block(conv4, 512, nums[4])

-    fc_dim = 4096
+    fc_dim = 512
     fc1 = paddle.layer.fc(input=conv5,
                           size=fc_dim,
                           act=paddle.activation.Relu(),