提交 cb34f6a2 编写于 作者: T typhoonzero

update fluid vgg16 and add readme

上级 d3905fbc
......@@ -3,10 +3,13 @@
#ADD reader.py /workspace/
#RUN python /workspace/reader.py
FROM python:2.7.14
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ADD paddle_k8s /usr/bin
ADD k8s_tools.py /root
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD reader.py /workspace/
RUN python /workspace/reader.py
ADD vgg16.py /workspace/
# Fluid distributed training perf test
## Steps to get started
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to the repository so kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
## Enable verbose logs
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happened in detail.
\ No newline at end of file
......@@ -61,7 +61,6 @@ start_fluid_process() {
if [ "${TRAINING_ROLE}" == "TRAINER" ]; then
check_failed_cnt ${TRAINERS}
sleep 5
stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1
export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
fi
export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
......
......@@ -33,7 +33,7 @@ spec:
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
......
......@@ -17,7 +17,7 @@ spec:
- name: trainer
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
command: ["paddle_k8s", "start_fluid"]
env:
- name: PADDLE_JOB_NAME
value: vgg16job
......@@ -30,7 +30,7 @@ spec:
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py"
value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
......
......@@ -140,12 +140,14 @@ def main():
def train_loop(exe, trainer_prog):
iters = 0
ts = time.time()
for pass_id in range(args.num_passes):
# train
start_time = time.time()
num_samples = 0
accuracy.reset(exe)
for batch_id, data in enumerate(train_reader()):
ts = time.time()
img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
......@@ -158,8 +160,8 @@ def main():
iters += 1
num_samples += len(data)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss, acc)
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" %
(pass_id, iters, loss, acc, time.time() - ts)
) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册