提交 cb34f6a2 编写于 作者: T typhoonzero

update fluid vgg16 and add readme

上级 d3905fbc
...@@ -3,10 +3,13 @@ ...@@ -3,10 +3,13 @@
#ADD reader.py /workspace/ #ADD reader.py /workspace/
#RUN python /workspace/reader.py #RUN python /workspace/reader.py
FROM python:2.7.14 FROM python:2.7.14
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ADD paddle_k8s /usr/bin ADD paddle_k8s /usr/bin
ADD k8s_tools.py /root ADD k8s_tools.py /root
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD reader.py /workspace/
RUN python /workspace/reader.py
ADD vgg16.py /workspace/ ADD vgg16.py /workspace/
# Fluid distributed training perf test
## Steps to get started
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to the repository so kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
## Enable verbose logs
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happened in detail.
\ No newline at end of file
...@@ -61,7 +61,6 @@ start_fluid_process() { ...@@ -61,7 +61,6 @@ start_fluid_process() {
if [ "${TRAINING_ROLE}" == "TRAINER" ]; then if [ "${TRAINING_ROLE}" == "TRAINER" ]; then
check_failed_cnt ${TRAINERS} check_failed_cnt ${TRAINERS}
sleep 5 sleep 5
stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1
export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id) export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
fi fi
export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips) export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
......
...@@ -33,7 +33,7 @@ spec: ...@@ -33,7 +33,7 @@ spec:
- name: TOPOLOGY - name: TOPOLOGY
value: "" value: ""
- name: ENTRY - name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0" value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
- name: TRAINER_PACKAGE - name: TRAINER_PACKAGE
value: "/workspace" value: "/workspace"
- name: PADDLE_INIT_PORT - name: PADDLE_INIT_PORT
......
...@@ -17,7 +17,7 @@ spec: ...@@ -17,7 +17,7 @@ spec:
- name: trainer - name: trainer
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid" image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
imagePullPolicy: Always imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"] command: ["paddle_k8s", "start_fluid"]
env: env:
- name: PADDLE_JOB_NAME - name: PADDLE_JOB_NAME
value: vgg16job value: vgg16job
...@@ -30,7 +30,7 @@ spec: ...@@ -30,7 +30,7 @@ spec:
- name: TOPOLOGY - name: TOPOLOGY
value: "" value: ""
- name: ENTRY - name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py" value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
- name: TRAINER_PACKAGE - name: TRAINER_PACKAGE
value: "/workspace" value: "/workspace"
- name: PADDLE_INIT_PORT - name: PADDLE_INIT_PORT
......
...@@ -140,12 +140,14 @@ def main(): ...@@ -140,12 +140,14 @@ def main():
def train_loop(exe, trainer_prog): def train_loop(exe, trainer_prog):
iters = 0 iters = 0
ts = time.time()
for pass_id in range(args.num_passes): for pass_id in range(args.num_passes):
# train # train
start_time = time.time() start_time = time.time()
num_samples = 0 num_samples = 0
accuracy.reset(exe) accuracy.reset(exe)
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
ts = time.time()
img_data = np.array(map(lambda x: x[0].reshape(data_shape), img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32") data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = np.array(map(lambda x: x[1], data)).astype("int64")
...@@ -158,8 +160,8 @@ def main(): ...@@ -158,8 +160,8 @@ def main():
iters += 1 iters += 1
num_samples += len(data) num_samples += len(data)
print( print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" % "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" %
(pass_id, iters, loss, acc) (pass_id, iters, loss, acc, time.time() - ts)
) # The accuracy is the accumulation of batches, but not the current batch. ) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time pass_elapsed = time.time() - start_time
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册