Commit 632108dc authored by: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_add_random_crop_op

......@@ -34,6 +34,14 @@ repos:
entry: bash ./tools/codestyle/cpplint_pre_commit.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
- repo: local
hooks:
- id: pylint-doc-string
name: pylint
description: Check python docstring style using docstring_checker.
entry: bash ./tools/codestyle/pylint_pre_commit.hook
language: system
files: \.(py)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
......
......@@ -18,6 +18,8 @@ env:
addons:
ssh_known_hosts: 13.229.163.131
before_install:
# For the pylint docstring checker
- sudo pip install pylint pytest astroid isort
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
......
......@@ -57,7 +57,10 @@ option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributions" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -202,7 +205,7 @@ endif(USE_NNPACK)
add_subdirectory(proto)
if(NOT MOBILE_INFERENCE)
if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
# "add_subdirectory(go)" should be placed after the following loine,
# because it depends on paddle/optimizer.
add_subdirectory(paddle/optimizer)
......@@ -230,3 +233,7 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
if (WITH_CONTRIB)
add_subdirectory(paddle/contrib)
endif()
......@@ -79,6 +79,9 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python
# For the docstring checker
RUN pip install pylint pytest astroid isort
COPY ./python/requirements.txt /root/
RUN pip install -r /root/requirements.txt
......@@ -101,6 +104,3 @@ RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
# By default, the development image runs the build script.
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
......@@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp
CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
# Cluster Training Benchmark
## Setup
- Platform
- Kubernetes: v1.6.2
- Linux Kernel: v3.10.0
- Resource
- CPU: 10 Cores per Pod
- Memory: 5GB per Pod
- Docker Image
We use different base Docker images to run the benchmark on Kubernetes:
- PaddlePaddle v2: paddlepaddle/paddle:0.11.0
- PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
- TensorFlow: tensorflow/tensorflow:1.5.0-rc0
- Model
VGG16 is used in this benchmark.
## Cases
- Variable
- Batch Size of training data.
- PServer count of the training job.
- The number of trainers.
- Invariant
- The resource of trainer/pserver Pod.
### Measure the Performance for Different Batch Size
- PServer Count: 40
- Trainer Count: 100
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure the Performance for Different PServer Count
- Trainer Count: 100
- Batch Size: 64
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>10</th>
<th>20</th>
<th>40 </th>
<th>60</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure Parallel Efficiency By Increasing Trainer Count
- PServer Count: 20
- Batch Size: 64
- Metrics:
$S = T_1 / T_N$
where $S$ is the speedup: the ratio of $T_1$, the training time with 1 trainer, to $T_N$, the training time with $N$ trainers.
The parallel efficiency is then:
$E = S / N$
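For concreteness, a minimal sketch of how these metrics are computed (the timings below are made-up placeholders, not measured results):

```python
# Hypothetical timings, for illustration only.
t1 = 1000.0   # training time with 1 trainer (seconds)
tn = 60.0     # training time with N trainers (seconds)
n = 20        # trainer count

speedup = t1 / tn          # S = T1 / TN
efficiency = speedup / n   # E = S / N
print("S = %.2f, E = %.1f%%" % (speedup, efficiency * 100))
```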
<table>
<thead>
<tr>
<th>Trainer Count </th>
<th>1</th>
<th>10</th>
<th>20 </th>
<th>30</th>
<th>40</th>
<th>50</th>
<th>60 </th>
<th>70</th>
<th>80</th>
<th>90</th>
<th>100 </th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
## Reproduce the benchmark
TODO
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
# You can get the mirror list here:
# https://launchpad.net/ubuntu/+archivemirrors
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
RUN pip install -U kubernetes opencv-python
RUN pip install paddlepaddle
# If the network is slow, you may need to add a proxy here.
# ENV https_proxy=
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# Unset the proxy if it was set.
# ENV https_proxy=""
# NOTE: By default, CI-built wheel packages are compiled with WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
# TensorFlow on Kubernetes
RUN pip install tensorflow==1.4.0
ADD tf_k8s /usr/bin
RUN chmod +x /usr/bin/tf_k8s
ADD vgg16_tf.py /workspace/
# The lines below may change frequently during debugging.
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD vgg16_fluid.py vgg16_v2.py /workspace/
# Performance of Distributed VGG16
## Test Result
### Hardware Information
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz : 2101.000
- cache size : 20480 KB
### Blas settings
Setting environment variable: `MKL_NUM_THREADS=1`.
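If you drive training from Python, one way to pin this is a sketch like the one below; it assumes the variable is exported before any library that links against MKL initializes its thread pool:

```python
import os

# Set before importing numpy/paddle, which may load MKL at import time.
os.environ["MKL_NUM_THREADS"] = "1"

import numpy as np  # imported after the env var on purpose
```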
### Single Node Single Thread
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 15.44 </td>
<td> 16.32 </td>
<td> 16.74 </td>
<td> 16.79 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 15.97 </td>
<td> 17.04 </td>
<td> 17.60 </td>
<td> 17.83 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> 9.09 </td>
<td> 9.10 </td>
<td> 9.24 </td>
<td> 8.66 </td>
</tr>
</tbody>
</table>
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 190.20 </td>
<td> 222.15 </td>
<td> 247.40 </td>
<td> 258.18 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 170.96 </td>
<td> 233.71 </td>
<td> 256.14 </td>
<td> 329.23 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Acceleration Rate
- PServer Count: 20
- Batch Size: 128
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Trainer Count </th>
<th>20</th>
<th>40</th>
<th>80</th>
<th>100</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 263.29 (78.64%) </td>
<td> 518.80 (77.47%) </td>
<td> 836.26 (62.44%) </td>
<td> 1019.29 (60.89%) </td>
</tr>
<tr>
<td>PaddlePaddle v2 (needs more tests) </td>
<td> 326.85 (92.85%) </td>
<td> 534.58 (75.93%) </td>
<td> 853.30 (60.60%) </td>
<td> 1041.99 (59.20%) </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Different PServer Count
- Trainer Count: 60
- Batch Size: 128
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>3</th>
<th>6</th>
<th>10</th>
<th>20</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid (should be fixed in the next PR) </td>
<td> 589.1 </td>
<td> 592.6 </td>
<td> 656.4 </td>
<td> 655.8 </td>
</tr>
<tr>
<td>PaddlePaddle v2 (needs more tests) </td>
<td> 593.4 </td>
<td> 791.3 </td>
<td> 729.7 </td>
<td> 821.7 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
*The performance gap between Fluid and v2 comes from network interference.*
## Steps to Run the Performance Test
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the Docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
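To collect logs programmatically instead, here is a sketch using the `kubernetes` Python client that the benchmark image already installs; the `"default"` namespace and the `paddle-job=vgg16job` label are assumptions, so match them to your `trainer.yaml`:

```python
from kubernetes import client, config

config.load_kube_config()  # reuses your configured kubectl context
v1 = client.CoreV1Api()

# Namespace and label selector are assumptions; adjust to your job.
pods = v1.list_namespaced_pod("default", label_selector="paddle-job=vgg16job")
for pod in pods.items:
    print("==== %s ====" % pod.metadata.name)
    print(v1.read_namespaced_pod_log(pod.metadata.name, "default"))
```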
## Enable Verbose Logs
Edit `pserver.yaml` and `trainer.yaml`, and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happens in detail.
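If you generate the YAML with a script such as `kube_gen_job.py` (which appears later in this change), the same switches can be appended to the container env list; a sketch, assuming `envs` is that list:

```python
envs = []  # stands in for the env list built by kube_gen_job.py
envs.append({"name": "GLOG_v", "value": "3"})
envs.append({"name": "GLOG_logtostderr", "value": "1"})
```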
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: MKL_NUM_THREADS
value: "1"
- name: TRAINING_ROLE
value: "PSERVER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
command: ["paddle_k8s", "start_fluid"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: TRAINING_ROLE
value: "TRAINER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
#!/bin/bash
# Update to point to the source file.
VGG_SRC="vgg16_fluid.py"
export TRAINING_ROLE=PSERVER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
# Need to wait for the ps to start first.
sleep 10
echo "done start ps"
export TRAINING_ROLE=TRAINER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
#!/bin/bash
check_trainer_ret() {
ret=$1
stdbuf -oL echo "job returned $ret...setting pod return message..."
stdbuf -oL echo "==============================="
if [ $ret -eq 136 ] ; then
echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
elif [ $ret -eq 139 ] ; then
echo "Segmentation Fault" > /dev/termination-log
elif [ $ret -eq 1 ] ; then
echo "General Error" > /dev/termination-log
elif [ $ret -eq 134 ] ; then
echo "Program Abort" > /dev/termination-log
fi
stdbuf -oL echo "termination log wroted..."
exit $ret
}
g_pservers=""
g_trainers=""
wait_running_pods(){
pserver_label="tf-job-pserver=${JOB_NAME}"
trainer_label="tf-job-trainer=${JOB_NAME}"
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
}
start_tf_pserver(){
wait_running_pods
label="tf-job-pserver=${JOB_NAME}"
pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
}
start_tf_trainer(){
wait_running_pods
label="tf-job-trainer=${JOB_NAME}"
trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
check_trainer_ret $?
}
start_tf(){
if [[ "${TF_JOB_NAME}" == "worker" ]]; then
start_tf_trainer
else
start_tf_pserver
fi
}
usage() {
echo "usage: tf_k8s [<args>]:"
echo " start_tf Start tensorflow jobs"
}
case "$1" in
start_tf)
start_tf
;;
--help)
usage
;;
*)
usage
;;
esac
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-tf-pserver
spec:
replicas: 10
template:
metadata:
labels:
tf-job-pserver: vgg16job-tf
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: ENTRY
value: "python vgg16_tf.py"
- name: JOB_NAME
value: vgg16job-tf
- name: PSERVERS_NUM
value: "10"
- name: TF_JOB_NAME
value: "ps"
- name: TRAINERS_NUM
value: "20"
- name: BATCH_SIZE
value: "128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-tf-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
tf-job-trainer: vgg16job-tf
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: JOB_NAME
value: vgg16job-tf
- name: TF_JOB_NAME
value: "worker"
- name: ENTRY
value: "python vgg16_tf.py"
- name: PSERVERS_NUM
value: "10"
- name: BATCH_SIZE
value: "128"
- name: TRAINERS_NUM
value: "20"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16v2job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16v2job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "python train.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
command: ["paddle_k8s", "start_pserver"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16v2job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16v2job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "256"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function
import sys
import time
import numpy as np
import paddle.v2 as paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
import argparse
import functools
import os
from paddle.fluid import debuger
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=16, help="Batch size for training.")
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='CPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument('--device_id', type=int, default=0, help="The device id.")
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
help='The data order; currently only NCHW is supported.')
parser.add_argument(
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--local',
type=str2bool,
default=True,
help='Whether to run as local mode.')
parser.add_argument(
"--ps_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--trainer_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--profile", action='store_true', help="If set, profile a few steps.")
# Flags for defining the tf.train.Server
parser.add_argument(
"--task_index", type=int, default=0, help="Index of task within the job")
args = parser.parse_args()
def vgg16_bn_drop(input):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act='relu',
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type='max')
conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
return fc2
def main():
if args.data_set == "cifar10":
classdim = 10
if args.data_format == 'NCHW':
data_shape = [3, 32, 32]
else:
data_shape = [32, 32, 3]
else:
classdim = 102
if args.data_format == 'NCHW':
data_shape = [3, 224, 224]
else:
data_shape = [224, 224, 3]
# Input data
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(batch_acc)
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimize_ops, params_grads = optimizer.minimize(avg_cost)
# Initialize executor
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
args.device_id)
exe = fluid.Executor(place)
# test
def test(exe):
test_pass_acc = fluid.average.WeightedAverage()
for batch_id, data in enumerate(test_reader()):
img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
outs = exe.run(inference_program,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[batch_acc, batch_size])
test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
return test_pass_acc.eval()
def train_loop(exe, trainer_prog):
iters = 0
ts = time.time()
train_pass_acc = fluid.average.WeightedAverage()
for pass_id in range(args.num_passes):
# train
start_time = time.time()
num_samples = 0
train_pass_acc.reset()
def run_step(batch_id, data):
img_data = np.array(
map(lambda x: x[0].reshape(data_shape), data)).astype(
"float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
loss, acc, b_size = exe.run(
trainer_prog,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[avg_cost, batch_acc, batch_size])
return loss, acc, b_size
if args.profile:
with profiler.profiler('All', 'total',
'/tmp/profile_vgg_%d' % args.task_index):
for batch_id, data in enumerate(train_reader()):
if batch_id > 5: break
run_step(batch_id, data)
total_time = 0.0
count = 0
for batch_id, data in enumerate(train_reader()):
ts = time.time()
loss, acc, b_size = run_step(batch_id, data)
iters += 1
num_samples += len(data)
train_pass_acc.add(value=acc, weight=b_size)
duration = time.time() - ts
total_time += duration
count += len(data)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
"Speed = %.2f (%.2f) img/s" % (pass_id, iters, loss, acc,
len(data) / duration,
count / total_time)
) # The accuracy is accumulated over batches, not just the current batch.
pass_elapsed = time.time() - start_time
pass_train_acc = train_pass_acc.eval()
pass_test_acc = test(exe)
print("Task:%d Pass = %d, Training performance = %f imgs/s, "
"Train accuracy = %f, Test accuracy = %f\n" %
(args.task_index, pass_id, num_samples / pass_elapsed,
pass_train_acc, pass_test_acc))
if args.local:
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
train_loop(exe, fluid.default_main_program())
else:
trainers = int(os.getenv("TRAINERS")) # total trainer count
print("trainers total: ", trainers)
training_role = os.getenv(
"TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id=args.task_index,
pservers=args.ps_hosts,
trainers=trainers)
if training_role == "PSERVER":
current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
"PADDLE_INIT_PORT")
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog)
exe.run(pserver_startup)
exe.run(pserver_prog)
elif training_role == "TRAINER":
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
paddle.dataset.flowers.test(),
batch_size=args.batch_size)
trainer_prog = t.get_trainer_program()
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
# TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
exe.run(fluid.default_startup_program())
train_loop(exe, trainer_prog)
else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == "__main__":
print_arguments()
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in TensorFlow
You can find distributed training example templates here:
https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
https://www.tensorflow.org/deploy/distributed
"""
import tensorflow as tf
import paddle.v2 as paddle
import numpy as np
import argparse
import time
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='CPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
'--data_format',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
help='The data order, NCHW=[batch, channels, height, width]. '
'Only NHWC is supported right now.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
"--ps_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--worker_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--job_name", type=str, default="", help="One of 'worker', 'ps'")
# Flags for defining the tf.train.Server
parser.add_argument(
"--task_index", type=int, default=0, help="Index of task within the job")
args = parser.parse_args()
class VGG16Model(object):
def __init__(self):
self.parameters = []
def batch_norm_relu(self, inputs, is_training):
"""Performs a batch normalization followed by a ReLU."""
# We set fused=True for a significant speed boost. See
# https://www.tensorflow.org/speed/speed_guide#common_fused_ops
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if args.data_format == 'NCHW' else -1,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
inputs = tf.nn.relu(inputs)
return inputs
def conv_bn_layer(self,
name,
images,
kernel_shape,
is_training,
drop_rate=0.0):
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
kernel_shape, dtype=tf.float32, stddev=1e-1),
name='weights')
conv = tf.nn.conv2d(
images,
kernel, [1, 1, 1, 1],
data_format=args.data_format,
padding='SAME')
biases = tf.Variable(
tf.constant(
0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(conv, biases)
out = self.batch_norm_relu(out, is_training)
out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
return out
def fc_layer(self, name, inputs, shape):
with tf.name_scope(name) as scope:
fc_w = tf.Variable(
tf.truncated_normal(
shape, dtype=tf.float32, stddev=1e-1),
name='weights')
fc_b = tf.Variable(
tf.constant(
0.0, shape=[shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
return out
def network(self, images, class_dim, is_training):
""" VGG16 model structure.
TODO(kuke): enable this network to support the 'NCHW' data format
"""
# conv1
conv1_1 = self.conv_bn_layer(
'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
conv1_2 = self.conv_bn_layer(
'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
# pool1
pool1 = tf.nn.max_pool(
conv1_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool1')
# conv2
conv2_1 = self.conv_bn_layer(
'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
conv2_2 = self.conv_bn_layer(
'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
# pool2
pool2 = tf.nn.max_pool(
conv2_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool2')
# conv3
conv3_1 = self.conv_bn_layer(
'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
conv3_2 = self.conv_bn_layer(
'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
conv3_3 = self.conv_bn_layer(
'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
# pool3
pool3 = tf.nn.max_pool(
conv3_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool3')
# conv4
conv4_1 = self.conv_bn_layer(
'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
conv4_2 = self.conv_bn_layer(
'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv4_3 = self.conv_bn_layer(
'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool4
pool4 = tf.nn.max_pool(
conv4_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# conv5
conv5_1 = self.conv_bn_layer(
'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_2 = self.conv_bn_layer(
'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_3 = self.conv_bn_layer(
'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool5
pool5 = tf.nn.max_pool(
conv5_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool5')
# flatten
shape = int(np.prod(pool5.get_shape()[1:]))
pool5_flat = tf.reshape(pool5, [-1, shape])
# fc1
drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
fc1 = self.fc_layer('fc1', drop, [shape, 512])
# fc2
bn = self.batch_norm_relu(fc1, is_training)
drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
fc2 = self.fc_layer('fc2', drop, [512, 512])
fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
return fc3
def run_benchmark(cluster_spec, server):
"""Run benchmark on cifar10 or flowers."""
if args.data_set == "cifar10":
class_dim = 10
raw_shape = (3, 32, 32)
dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
None, 3, 32, 32)
else:
class_dim = 102
raw_shape = (3, 224, 224)
dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
None, 3, 224, 224)
device = tf.train.replica_device_setter(
worker_device="/job:worker/task:{}".format(args.task_index),
cluster=cluster_spec)
with tf.device(device):
images = tf.placeholder(tf.float32, shape=dat_shape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
vgg16 = VGG16Model()
logits = vgg16.network(images, class_dim, is_training)
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
avg_loss = tf.reduce_mean(loss)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
global_step = tf.Variable(0, name='global_step', trainable=False)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_loss, global_step=global_step)
summary_op = tf.summary.merge_all()
init_op = tf.global_variables_initializer()
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
buf_size=5120),
batch_size=args.batch_size)
# test
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
return np.mean(test_accs)
config = tf.ConfigProto(
intra_op_parallelism_threads=1,
inter_op_parallelism_threads=1,
log_device_placement=True)
config.gpu_options.allow_growth = True
hooks = [tf.train.StopAtStepHook(last_step=1000000)]
with tf.train.MonitoredTrainingSession(
master=server.target,
is_chief=(args.task_index == 0),
hooks=hooks,
config=config) as sess:
iters, num_samples, start_time = 0, 0, 0.0
for pass_id in range(args.num_passes):
# train
num_samples = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
train_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
train_labels = np.array(map(lambda x: x[1], data)).astype(
'int64')
iter_begin_time = time.time()
_, loss, acc = sess.run([train_op, avg_loss, accuracy],
feed_dict={
images: train_images,
labels: train_labels,
is_training: True
})
iters += 1
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
% (pass_id, iters, loss, acc,
len(data) / (time.time() - iter_begin_time)))
num_samples += len(data)
train_elapsed = time.time() - start_time
# test
pass_test_acc = test()
print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, pass_test_acc))
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
print_arguments()
ps_hosts = args.ps_hosts.split(",")
worker_hosts = args.worker_hosts.split(",")
# Create a cluster from the parameter server and worker hosts.
cluster_spec = tf.train.ClusterSpec({
"ps": ps_hosts,
"worker": worker_hosts
})
# Create and start a server for the local task.
server = tf.train.Server(
cluster_spec, job_name=args.job_name, task_index=args.task_index)
if args.job_name == "ps":
print("start pserver")
server.join()
elif args.job_name == "worker":
print("start worker")
run_benchmark(cluster_spec, server)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os
DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
print "batch_size", BATCH_SIZE
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0
def vgg(input, nums, class_dim):
def conv_block(input, num_filter, groups, num_channels=None):
return paddle.networks.img_conv_group(
input=input,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=paddle.activation.Relu(),
pool_type=paddle.pooling.Max())
assert len(nums) == 5
# the input feature has 3 channels
conv1 = conv_block(input, 64, nums[0], 3)
conv2 = conv_block(conv1, 128, nums[1])
conv3 = conv_block(conv2, 256, nums[2])
conv4 = conv_block(conv3, 512, nums[3])
conv5 = conv_block(conv4, 512, nums[4])
fc_dim = 512
fc1 = paddle.layer.fc(input=conv5,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(input=fc1,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
out = paddle.layer.fc(input=fc2,
size=class_dim,
act=paddle.activation.Softmax())
return out
def vgg13(input, class_dim):
nums = [2, 2, 2, 2, 2]
return vgg(input, nums, class_dim)
def vgg16(input, class_dim):
nums = [2, 2, 3, 3, 3]
return vgg(input, nums, class_dim)
def vgg19(input, class_dim):
nums = [2, 2, 4, 4, 4]
return vgg(input, nums, class_dim)
def main():
global ts
paddle.init(use_gpu=False)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
# NOTE: v2 distributed training averages updates across nodes, so the learning rate is scaled down by the node count.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=learning_rate / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
train_reader = paddle.batch(
paddle.reader.shuffle(
cifar.train10(),
# To use other data, replace the above line with:
# reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
cifar.test10(),
# To use other data, replace the above line with:
# reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers,
is_local=False)
# End batch and end pass event handler
def event_handler(event):
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
if __name__ == '__main__':
main()
......@@ -24,22 +24,22 @@ Currently supported `--model` argument include:
* Run the following command to start a benchmark job locally:
```bash
python fluid_benchmark.py --model mnist --parallel 1 --device GPU --with_test
python fluid_benchmark.py --model mnist --device GPU
```
You can choose to use GPU/CPU training. With GPU training, you can specify
`--parallel 1` to run multi GPU training.
`--gpus <gpu_num>` to run multi GPU training.
* Run distributed training with parameter servers:
* start parameter servers:
```bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* start trainers:
```bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* Run distributed training using NCCL2
```bash
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method nccl2
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```
## Run Distributed Benchmark on Kubernetes Cluster
......@@ -48,7 +48,7 @@ We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submi
distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver --with_test" --disttype pserver
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
```
Then the yaml files are generated under directory `myjob`, you can run:
......
......@@ -94,6 +94,10 @@ def parse_args():
'--memory_optimize',
action='store_true',
help='If set, optimize runtime memory before start.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='If set, omit the actual data-reading operators.')
parser.add_argument(
'--update_method',
type=str,
......@@ -198,6 +202,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe.run(train_prog)
return
if args.use_fake_data:
raise Exception(
"fake data is not supported in single GPU test for now.")
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_prog)
......@@ -244,7 +252,31 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id):
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
# Generate fake input data:
if args.use_fake_data:
for var in feed_var_list:
v = startup_prog.global_block().clone_variable(var)
var.persistable = True
v.persistable = True
real_shape = list(var.shape)
real_shape[0] = args.batch_size / args.gpus
startup_prog.global_block().append_op(
outputs={"Out": v},
type="fill_constant",
attrs={"shape": real_shape,
"value": 1.0,
"dtype": var.dtype})
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
if nccl_id_var and trainer_id == 0:
#FIXME(wuyi): wait for the other trainers to start listening
time.sleep(30)
startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
......@@ -256,10 +288,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
exec_strategy=strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in range(args.pass_num):
num_samples = 0
......@@ -271,7 +300,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
num_samples = 0
if iters == args.iterations:
break
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.use_fake_data:
loss, = exe.run([avg_loss.name])
else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
num_samples += len(data)
......
......@@ -112,6 +112,7 @@ def gen_job():
envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: the directories below are cluster specific; please modify
# these settings before you run on your own cluster.
envs.append({
......
......@@ -54,5 +54,13 @@ envs = [
"fieldPath": "status.podIP"
}
}
},
{
"name": "PADDLE_CURRENT_IP",
"valueFrom": {
"fieldRef": {
"fieldPath": "status.podIP"
}
}
}
]
......@@ -41,6 +41,10 @@ if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
if(EIGEN_USE_THREADS)
add_definitions(-DEIGEN_USE_THREADS)
endif(EIGEN_USE_THREADS)
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
......
......@@ -212,6 +212,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS}
-Dprotobuf_BUILD_TESTS=OFF
-DCMAKE_SKIP_RPATH=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
......
......@@ -1003,9 +1003,9 @@ dice_loss
.. autofunction:: paddle.fluid.layers.dice_loss
:noindex:
bilinear_interp
upsampling_bilinear2d
____
.. autofunction:: paddle.fluid.layers.bilinear_interp
.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
:noindex:
# Paddle Fluid Developer Guide
---
### ==1==. Why do we need PaddlePaddle Fluid?
---
### Two fundamental questions
<font size=6>
1. How do we describe machine learning models and optimization processes?
  - Completely and self-consistently, with enough expressive power to support whatever computations may arise
1. How do we make full use of resources to compute efficiently?
  - Support asynchronous devices, multiple GPUs, and distributed computation
  - Lower the development cost of computation and of computational optimization
  - ...
</font>
---
### How to describe the model and the optimization process?
<font size=6>
<table>
<thead>
<tr>
<th> </th>
<th>A sequence of layers executed one after another</th>
<th>A computation graph built from variables and operators </th>
<th>No explicit notion of a model </th>
</tr>
</thead>
<tbody>
<tr>
<td> 2013</td>
<td> Caffe, Theano, Torch, PaddlePaddle </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> 2015 </td>
<td> </td>
<td> TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
<td> </td>
</tr>
<tr>
<td>2016 </td>
<td> </td>
<td> </td>
<td> PyTorch, TensorFlow Eager Execution, <font color=#483D8B>**==PaddlePaddle Fluid==**</font> </td>
</tr>
</tbody>
</table>
---
### <p align="center">目标 </p>
<font size=6>
- Improve the ability to describe all kinds of machine learning tasks: be able to describe any machine learning model that may arise.
- Keep the code structure clear and logical, with modules fully decoupled: internal and external contributors can focus on the functional modules they need and build further development on top of the framework.
- By design, leave room and potential for technical optimization.
- Decoupled code lowers the development cost of multi-device support, computational optimization, and the like.
- Under a unified design philosophy, achieve automatically scalable, automatically fault-tolerant distributed computation.
</font>
---
## ==2.== Design Overview
---
# Fluid: system architecture
- <span style="background-color:#ACD6FF;">[A compiler-like execution flow that separates compile time from runtime](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)</span>
<br>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_compiler.png" width=100%>
</p>
---
#### Let's use a concrete Fluid program to distinguish compile time from runtime
---
### Fluid at compile time
<font size=5>
- ==**Define the forward computation**==
```python
x = fluid.layers.data(name='x',shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)
```
- ==**Add backward, regularization, and optimization**==
```python
learning_rate = 0.01
sgd_optimizer = fluid.optimizer.SGD(learning_rate)
sgd_optimizer.minimize(avg_cost)
```
</font>
---
### `Program` vs. computation graph
<font size=5>
- In scientific computing, the computation graph is a classic way to describe computation. The figure below shows how a complete computation graph is built by starting from the forward graph (blue) and adding backward (red) and optimizer-related (green) operations:
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/graph_construction_example_all.png" width=60%>
</p>
- Fluid ==uses a `Program`, rather than a computation graph,== to describe the model and the optimization process. A `Program` is composed of `Block`s, `Operator`s, and `Variable`s; these concepts are expanded on later in this deck.
- At compile time, Fluid takes the forward computation (for now, think of it simply as an ordered flow of computation) as a `Program`, and completes it into the full computation by adding the relevant `Operator`s and `Variable`s in the order: forward -> backward -> gradient clipping -> regularization -> optimization.
</font>
---
### Fluid at runtime
<font size=5>
- ==**Read in the data**==
```python
train_reader = paddle.batch(
paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=20)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
```
- ==**Define the device that executes the program**==
```python
place = fluid.CPUPlace()
feeder = fluid.DataFeeder(place=place,feed_list=[x, y])
```
- ==Create an executor (`Executor`) and run the startup `Program` and the training `Program`==
```python
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for data in train_reader():
avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost])
print(avg_loss_value)
```
</font>
---
### Summary: what does the framework do, and what does the user do?
<br>
<font size=5>
<table>
<thead>
<tr>
<th>Building the training program</th>
<th>Running the training</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<span style="background-color:#B3D9D9">用户</span>:描述前向运算<br><span style="background-color:#DAB1D5;">框架</span>:添加反向运算<br><span style="background-color:#DAB1D5;">框架</span>:添加优化运算<br><span style="background-color:#DAB1D5;">框架</span>:添加内存优化<br><span style="background-color:#DAB1D5;">框架</span>:添加并行/多设备/分布式相关的计算单元
</td>
<td>
<span style="background-color:#DAB1D5;">框架</span>:创建Operator(计算)+ Variable(数据)<br><span style="background-color:#DAB1D5;">框架</span>:创建`Block`<br><span style="background-color:#DAB1D5;">框架</span>:内存管理/设备管理<br><span style="background-color:#DAB1D5;">框架</span>:执行计算
</td>
</tr>
</tbody>
</table>
</font>
---
### <p align="center">总结:编译时</p>
<font size=5>
<span style="background-color:#A3D1D1;">**用户编写一段Python程序,描述模型的前向计算**</span>
1. 创建变量描述 `VarDesc`
1. 创建operators的描述 `OpDesc`
1. 创建operators的属性
1. 推断变量的类型和形状,进行静态检查:`inferShape`
1. 规划变量的内存复用
1. 创建反向计算
1. 添加优化相关的Operators
1. (可选)添加多卡/多机相关的Operator,生成在多卡/多机上运行的程序
</font>
---
### <p align="center">总结:运行时</p>
<font size=5>
<span style="background-color:#C7C7E2;">**执行规划好的计算**</span>
1. 创建`Executor`
1. 为将要执行的一段计算,在层级式的`Scope`空间中创建`Scope`
1. 创建`Block`,依次执行`Block`
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compile_run_time.png" width=50%><br>
<font size=3> Figure. Overview of compile time and runtime</font>
</p>
</font>
---
<!-- *template: invert -->
## ==3==. How does the user describe computation?
---
### Fluid: define computation ==as if writing a program==
<font size=5>
- Sequential execution
```python
x = fluid.layers.data(name='x',shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
```
- Conditional branches: [switch](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), [ifelse](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md)
```python
a = fluid.Var(10)
b = fluid.Var(0)
switch = fluid.switch()
with switch.block():
with switch.case(fluid.less_equal(a, 10)):
fluid.print("Case 1")
with switch.case(fluid.larger(a, 0)):
fluid.print("Case 2")
with switch.default():
fluid.print("Case 3")
```
>[A Lisp cond form may be compared to a continued if-then-else as found in many algebraic programming languages](https://www.cs.cmu.edu/Groups/AI/html/cltl/clm/node84.html).
</font>
---
### Fluid: define computation ==as if writing a program==
<font size=5>
- Loops: [while](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
```python
d0 = layers.data("d0", shape=[10], dtype='float32')
data_array = layers.array_write(x=d0, i=i)
array_len = layers.fill_constant(shape=[1],dtype='int64', value=3)
cond = layers.less_than(x=i, y=array_len)
while_op = layers.While(cond=cond)
with while_op.block():
d = layers.array_read(array=data_array, i=i)
i = layers.increment(x=i, in_place=True)
layers.array_write(result, i=i, array=d)
layers.less_than(x=i, y=array_len, cond=cond)
```
- For a complete example, see [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_while_op.py#L36-L44)
- For beam search, see [->]( https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
</font>
---
#### <p align="center">总结</p>
<font size=5>
1. The description syntax exposed to users is complete and self-consistent, and capable of describing complex computation processes
1. Its usage and core concepts are analogous to a programming language, so existing intuition transfers directly
1. It supports defining a problem and solving it step by step
</font>
---
## ==4.== Core concepts
---
### Compile-time concepts: ==describing variables and computation==
<font size=5>
- `VarDesc` + `TensorDesc` + `OpDesc` -> `BlockDesc` -> `ProgramDesc`
- https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto
- <span style="background-color:#DAB1D5;">什么是 Fluid Program</span>
- 在Fluid中,一个神经网络任务(训练/预测)被描述为一段`Program`
- `Program`包含对`Variable`(数据)和 `Operator`(对数据的操作)的描述
- `Variable``Operator` 被组织为多个可以嵌套的`Block`,构成一段完整的`Fluid Program`
>编译阶段最终,经过 Transpiler 的执行规划,变换处理,生成使用`protobuf`序列化后的`ProgramDesc`。可以发送给多卡或者网络中的其它计算节点执行
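A minimal introspection sketch from Python (the op names printed depend on the layers actually used):
```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.fc(input=x, size=1)

prog = fluid.default_main_program()
print(len(prog.blocks))                             # number of (possibly nested) Blocks
print([op.type for op in prog.global_block().ops])  # e.g. ['mul', 'elementwise_add']
```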
</font>
---
### Compile-time concepts: ==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**==
<font size=5>
1. Takes a `ProgramDesc` as input and produces a new `ProgramDesc`
   - *Memory optimization transpiler*: inserts `FreeMemoryOps` into the original `ProgramDesc` so memory is released before an iteration finishes, keeping the memory footprint small
   - *Distributed training transpiler*: rewrites the original `ProgramDesc` into its distributed counterpart, producing two new `ProgramDesc`s (driven from Python as sketched below):
     1. the `ProgramDesc` executed by the trainer processes
     1. the `ProgramDesc` executed by the parameter server
1. ==**WIP**==: takes a `ProgramDesc` and generates code that `gcc`, `nvcc`, `icc`, etc. can compile directly, yielding an executable
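A minimal sketch of driving the distributed transpiler, assuming the 2018-era `fluid.DistributeTranspiler` interface; the endpoint and trainer count are made up:
```python
import paddle.fluid as fluid

# ... build the forward/backward/optimizer program first ...
t = fluid.DistributeTranspiler()
t.transpile(trainer_id=0, pservers="127.0.0.1:6174", trainers=2)

pserver_prog = t.get_pserver_program("127.0.0.1:6174")  # run by the parameter server
trainer_prog = t.get_trainer_program()                   # run by each trainer
```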
</font>
---
### Transpiler
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/transpiler.png" width=70%>
</p>
---
### Printing the `ProgramDesc`
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/print_fluid_program.png" width=70%>
</p>
<font size=5>
- `default_startup_program`: creates the learnable parameters and initializes them
- `default_main_program`: the user-defined model, including the forward, backward, optimization, and all other required computation
- Print a human-readable `Program`:
```python
from paddle.v2.fluid import debuger, framework  # the module really is spelled `debuger`
print debuger.pprint_program_codes(framework.default_main_program().desc)
```
</font>
---
### Sample output
<font size=5>
<table>
<thead>
<th>variable in block 0</th>
<th>variable in block 0</th>
</thead>
<tbody>
<tr>
<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc1.png" width=70%></td>
<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc2.png" width=70%></td>
</tr>
</tbody>
</table>
</font>
---
### 运行时概念
<font size=5>
- Data-related
  - `Tensor` / `LoDTensor` / `Variable`
  - `Scope`
- Computation-related
  - `Block`
  - `Kernel`, `OpWithKernel`, `OpWithoutKernel`
<table>
<thead>
<th></th>
<th>protobuf messages</th>
<th>C++ class objects</th>
</thead>
<tbody>
<tr>
<td>Data</td>
<td>[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107)
</td>
<td>[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24)
</td>
</tr>
<tr>
<td>Operation</td>
<td>[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35)
</td>
<td>[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64)
</td>
</tr>
<tr>
<td>Block</td>
<td>BlockDesc
</td>
<td>Block
</td>
</tr>
</tbody>
</table>
- Execution-related: `Executor`
</font>
---
#### Tensor and LoD (Level-of-Detail) Tensor
<font size=5>
- A Tensor is the generalization of an $n$-dimensional array; a LoDTensor attaches sequence information on top of a Tensor
- In Fluid, inputs, outputs, and the learnable parameters of a network are all represented uniformly as LoDTensors (n-dimensional arrays)
- One mini-batch of input data is one LoDTensor
- Thanks to the `LoDTensor` representation, RNNs in Fluid process variable-length sequences without padding
- LoD can be understood simply as `std::vector<std::vector<int>>`
- For non-sequence data, the LoD information is empty
<table>
<thead>
<th></th>
<th>TensorFlow</th>
<th>PaddlePaddle</th>
</thead>
<tbody>
<tr>
<td>RNN</td>
<td>Support
</td>
<td>Support
</td>
</tr>
<tr>
<td>recursive RNN</td>
<td>Support
</td>
<td>Support
</td>
</tr>
<tr>
<td>padding zeros</td>
<td>Must
</td>
<td>No need
</td>
<tr>
<td>blob data type</td>
<td>Tensor
</td>
<td>LODTensor
</td>
</tr>
</tbody>
</table>
</font>
---
#### A LoD example
<font size=4>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LoDTensor.png" width=43%>
</p>
- LoD of figure (a)
```cpp
[0, 5, 8, 10, 14]
```
- LoD of figure (b)
```cpp
[[0, 5, 8, 10, 14] /*level=1*/, [0, 2, 3, 5, 7, 8, 10, 13, 14] /*level=2*/]
```
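- A minimal sketch of building the LoDTensor of figure (a) from Python, assuming the offset-based `set_lod` interface of this era:
```python
import numpy as np
import paddle.fluid.core as core

t = core.LoDTensor()
t.set(np.random.rand(14, 8).astype('float32'), core.CPUPlace())  # 14 time steps in total
t.set_lod([[0, 5, 8, 10, 14]])  # 4 sequences with lengths 5, 3, 2, 4
```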
</font>
---
#### How Tensor, Variable, and Scope relate
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/scope_variable_tensor.png" width=40%>
</p>
<font size=5>
1. `Block` is an implementation-level concept that is not exposed at the application layer. Users currently cannot create or use `Block`s directly; the only concept they perceive is the `Program`.
1. Logically, a `Block` is analogous to a pair of braces in a programming language: it defines a scope in which a piece of code runs
1. The `Executor` creates a `Scope` for every `Block`; `Block`s can nest, so `Scope`s nest as well
</font>
---
### Executor
<font size=5>
<table>
<thead>
<th>Interface</th>
<th>Description</th>
</thead>
<tbody>
<tr>
<td><p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/executor.png" width=60%>
</p></td>
<td><span style="background-color:#B3D9D9;">输入</span><br>1. `ProgramDesc`<br>2. `Scope`<br> 3.`block_id`<br><br><span style="background-color:#B3D9D9;">解释执行步骤</span><br>1. 创建所有 Variables<br> 2. 逐一创建 Operator 并运行
</td>
</tr>
</tbody>
</table>
</font>
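From Python, the `Executor` is driven roughly as follows (a minimal sketch; the toy network is made up):
```python
import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())  # create and initialize parameters
loss_v, = exe.run(fluid.default_main_program(),
                  feed={'x': np.random.rand(8, 13).astype('float32')},
                  fetch_list=[loss])
```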
---
### Operator/OpWithKernel/Kernel
<font size=5>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator1.png" width=50%>
</p>
- Operators are stateless; the core of an Operator is its ==Run== method
- One operator may register multiple kernels
- An operator may also have no kernel at all: while_op, ifelse_op
</font>
---
#### Fluid Operator vs. PaddlePaddle layers
<font size=5>
<table>
<thead>
<th>Layer</th>
<th>Operator</th>
</thead>
<tbody>
<tr>
<td><p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/layer.png" width=70%>
</p></td>
<td><p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator2.png" width=73%>
</p></td>
</tr>
<tr>
<td>1. Maintains internal state<br>2. Has forward and backward methods</td>
<td>1. Stateless<br>2. Has only a Run method</td>
</tr>
</tbody>
</table>
</font>
---
### ==5.== Memory management
---
### Goals
- Provide a unified allocation/deallocation interface for heterogeneous devices
- Minimize both the time spent managing memory and the management overhead
- Reduce memory fragmentation
- Decouple memory management completely from computation (Operators/Kernels)
- Unified memory management is the foundation of memory optimization
---
<font size=5>
### The Memory interface
- The memory-management module exposes three basic interfaces to the application logic above it:
```cpp
template <typename Place>
void* Alloc(Place place, size_t size);
template <typename Place>
void Free(Place place, void* ptr);
template <typename Place>
size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
};
```
- The template parameter `Place` indicates the device on which the allocation happens
- An implementation specializes the supported `Place`s and provides these three interfaces for each
</font>
---
### Code structure
<font size=5>
The memory-management module consists of two parts:
1. SystemAllocator: the interface that actually allocates and frees memory on the physical device
1. BuddyAllocator: the memory-management algorithm
</font>
---
### System Allocator
<font size=5>
- SystemAllocator is the base class for physical memory allocation and deallocation
- Allocation and deallocation on any device ultimately reduce to calls on this standard interface
- Each device gets a MemoryAllocator derived from SystemAllocator
```cpp
class SystemAllocator {
public:
virtual ~SystemAllocator() {}
virtual void* Alloc(size_t& index, size_t size) = 0;
virtual void Free(void* p, size_t size, size_t index) = 0;
virtual bool UseGpu() const = 0;
};
```
</font>
---
### CPU/GPU Allocator
<font size=5>
```cpp
class CPUAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t& index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
};
#ifdef PADDLE_WITH_CUDA
class GPUAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t& index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t gpu_alloc_size_ = 0;
size_t fallback_alloc_size_ = 0;
};
#endif
```
- CPUAllocator and GPUAllocator both derive from SystemAllocator and call the corresponding standard library functions to allocate and free physical memory.
- Once a large, contiguous chunk of physical memory has been allocated, the memory-management algorithm takes over: it hands out, reclaims, and reuses blocks within that chunk.
</font>
---
### CPU Allocator
<font size=5>
- CPU memory can be allocated in two flavors:
  1. non-pinned memory: pageable memory
  2. pinned memory: page-locked memory
- Allocating too much page-locked memory reduces the pageable memory available to the system and can hurt overall performance, so the CPU default is pageable memory
- gflags control the size of the one-shot allocation and whether page-locked memory is used:
```cpp
DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
DEFINE_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
```
</font>
---
### GPU Allocator
<font size=5>
- GPU memory is allocated with cudaMalloc
- GPUAllocator::Alloc first computes the available memory on the specified GPU device
  - If the available memory covers the requested size, cudaMalloc is called
  - If it does not, the allocator currently reports an error and exits.
- A gflag controls how much GPU memory the one-shot allocation grabs:
```cpp
DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
"Default use 92% of GPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
```
</font>
---
#### The memory-management algorithm: [Buddy Memory Allocation](https://en.wikipedia.org/wiki/Buddy_memory_allocation)
<font size=5>
- Memory Arena: a large contiguous chunk is allocated once, and all further management happens inside it: blocks are dynamically allocated, freed, and reused.
- Buddy allocation:
  - Memory is split into power-of-two partitions, and requests are served with a best-fit strategy.
  - On free, the buddy block is checked: if the adjacent block is free as well, the two are merged, minimizing fragmentation.
  - Allocated blocks are aligned to natural physical-memory boundaries, improving access efficiency.
- The algorithm is time-efficient, but the best-fit rounding wastes some memory (see the sketch below)
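- A small illustrative sketch (not Paddle code) of the two core operations, block-size rounding and buddy lookup:
```python
MIN_BLOCK = 1 << 12  # assume a 4 KiB minimum block size

def round_up_pow2(size):
    """Power-of-two size class that actually serves a request."""
    n = MIN_BLOCK
    while n < size:
        n <<= 1
    return n

def buddy_offset(offset, block_size):
    """Offset of a block's buddy: flip the bit at log2(block_size)."""
    return offset ^ block_size

print(round_up_pow2(10000))    # 16384 -> 6384 bytes wasted by best-fit rounding
print(buddy_offset(0, 16384))  # 16384: if that block is also free, merge to 32768
```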
</font>
---
### Buddy Allocator
<font size=5>
- BuddyAllocator is a singleton: each device (e.g. CPU(0)/GPU(0)/GPU(1)) owns one BuddyAllocator
- A BuddyAllocator holds a SystemAllocator as a private member
- When a request exceeds the free memory a BuddyAllocator manages, the SystemAllocator is invoked to allocate physical memory on the target device
</font>
---
### Example: the CPU implementation of the memory interface
<font size=5>
- The layers above uniformly allocate, free, and query memory through the BuddyAllocator:
```cpp
template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size);
VLOG(10) << " pointer=" << p;
return p;
}
template <>
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
template <>
size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
return GetCPUBuddyAllocator()->Used();
}
```
</font>
---
### ==6.== Multi-device support
---
### Multi-device support (1)
<font size=5>
- step 1: add a Place type, <span style="background-color:#DAB1D5;">implemented by the contributor and added to the framework</span>
  - A Place can be thought of as an integer plus an enum: device id + device type
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/place.png" width=40%>
</p>
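- At the Python layer the same idea surfaces as `CPUPlace`/`CUDAPlace` (a minimal sketch):
```python
import paddle.fluid as fluid

cpu = fluid.CPUPlace()      # device type only
gpu0 = fluid.CUDAPlace(0)   # device type + device id
exe = fluid.Executor(gpu0)  # binds the computation to that place
```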
- DeviceContext
  - Each Place maps to a DeviceContext, which organizes and manages device-related state
  - For example, a GpuDeviceContext manages the CUDA stream
  - In the current implementation, some special libraries also get a DeviceContext of their own, for example:
```cpp
class MKLDNNDeviceContext : public CPUDeviceContext {……}
```
  - What each device's DeviceContext must manage differs; implement it according to the device's concrete needs
</font>
---
### Multi-device support (2)
<font size=5>
- step 2: add a KernelType and register Kernel objects for it, <span style="background-color:#DAB1D5;">implemented by the contributor and registered with the framework</span>. Kernels of the same operator can be distinguished by:
  1. Place: the execution device
  1. DataType: the element type, FP32/FP64/INT32/INT64
  1. Memory layout: how the run-time Tensor is laid out in memory, e.g. NCHW, NHWC
  1. the library used
  so one operator can register multiple Kernels.
```cpp
struct OpKernelType {
proto::DataType data_type_;
DataLayout data_layout_;
platform::Place place_;
LibraryType library_type_;
}
```
</font>
---
### Multi-device support (3)
<font size=5>
step 3: run-time KernelType inference and Kernel switching, <span style="background-color:#DAB1D5;">adjust the inference and switching rules as needed</span>
- Expected Kernel: the kernel we would like to call, decided by (1) the `Place` and compute precision, or (2) a library the user explicitly selects in the configuration, such as `cudnn` or `mkldnn`.
- Actual Kernel: the `KernelType` actually implied at run time by the `Operator`'s inputs (`Variable`s)
- When the Expected and Actual Kernels disagree, the framework inserts `data_transformer` or `data_layout_transform` steps so the Expected Kernel can run, covering:
  - CPUPlace -> GPUPlace: cross-device memory copy
  - NCHW -> nChw8c: layout transformation
  - FP32 -> FP16: precision conversion _**not yet supported**_
  - ……
- This logic lives in OperatorWithKernel's Run method [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.cc#L497)
</font>
---
## ==7.== while_op
---
### while_op
<font size=5>
- Repeatedly executes a `Program` until the condition operator decides the loop condition no longer holds
- What makes while_op special:
  1. while_op has no kernel
  1. while_op owns a `Block` of its own, forming nested `Block`s
  1. ==while_op creates an Executor internally to execute its `Block` repeatedly==
- while_op inputs/outputs: LoDTensorArray
```cpp
namespace paddle {
namespace framework {
using LoDTensorArray = std::vector<LoDTensor>;
}
}
```
- Each iteration "slices" one piece out of the original input
- LoDTensorArray is exposed to Python and is one of Fluid's basic data structures; users can create and use it directly (see the sketch below)
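- A minimal sketch of creating and using a LoDTensorArray from Python:
```python
import paddle.fluid as fluid

i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
x = fluid.layers.fill_constant(shape=[2, 3], dtype='float32', value=1.0)
arr = fluid.layers.array_write(x=x, i=i)          # creates the array, writes x at arr[0]
x_back = fluid.layers.array_read(array=arr, i=i)  # reads it back
```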
</font>
---
### Overview of while_op's [Run](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/while_op.cc#L42) method
<font size=5>
```cpp
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program();
auto step_scopes =
scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
while (cond.data<bool>()[0]) {
auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope);
executor.Run(*program, &current_scope, block->ID(),
false /*create_local_scope*/);
}
}
```
</font>
---
### An important application of while_op: Dynamic RNN
---
### What is a `dynamicRNN`?
<font size=5>
<br>
1. Users define the computation within one time step; the framework takes sequence input and repeatedly applies that user-defined step over it
1. Learnable parameters are shared across time steps
1. `dynamicRNN` is implemented with `while_op`
1. If a `memory` is defined inside a `dynamicRNN`, it forms a recurrent neural network; otherwise it merely loops the predefined single-step computation over the input sequence (see the sketch below)
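A minimal sketch of the user-facing API, assuming the 2018-era `fluid.layers.DynamicRNN`; sizes and names are made up:
```python
import paddle.fluid as fluid

sentence = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
emb = fluid.layers.embedding(input=sentence, size=[10000, 32])

drnn = fluid.layers.DynamicRNN()
with drnn.block():
    word = drnn.step_input(emb)                # one step of every still-alive sequence
    prev = drnn.memory(shape=[32], value=0.0)  # recurrent state, zero-initialized
    hidden = fluid.layers.fc(input=[word, prev], size=32, act='tanh')
    drnn.update_memory(prev, hidden)           # hidden becomes next step's prev
    drnn.output(hidden)

last = fluid.layers.sequence_last_step(drnn())  # drnn() returns the per-step outputs
```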
</font>
---
#### The `dynamicRNN` user interface
<font size=5>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/user_interface.png" width=75%>
</p>
- The key elements of a `dynamicRNN`
  1. **step input**: the input `dynamicRNN` consumes at each time step
  1. **step function**: the user-defined single-step computation
  1. **memory**: forms the recurrent connection
  1. **external/static memory**: external input fully readable at every step of the computation
</font>
---
#### Memory inside dynamicRNN
<font size=5>
A `memory` in `dynamicRNN` behaves much like a reference variable in C++
- A `memory` "points to" the output variable of some operator, call it A
- A `memory` can be initialized from a LoDTensor (non-sequence when the LoD is empty, sequence otherwise); by default a `memory` is zero-initialized
- The forward computation of the `memory` runs after operator A's forward computation
- That forward computation makes the `memory` "point to" A's output LoDTensor
- The `memory`'s output can become another operator's input, which closes the "recurrent" connection
</font>
---
### DynamicRNN implementation details
<font size=5>
- `while_op` <span style="background-color:#DAB1D5;">cannot form a dynamicRNN on its own</span>; it must cooperate with a set of related operators and data structures
- Operators it depends on (only the most important are listed here, not all):
- `lod_rank_table` operator
- `lod_tensor_to_array` operator
- `array_to_lod_tensor` operator
- `shrink_memory` operator
- Data structures it depends on
- `TensorArray`
- `LoDRankTable`
- In Fluid, RNNs take variable-length sequence input without padding; the data structures above and the related operators work together to batch-process variable-length input
</font>
---
### How does `dynamicRNN` compute in batches?
<font size=5>
- The problem:
  - An RNN can be viewed as an unrolled feed-forward network whose depth equals the length of the longest sequence
  - Without padding the variable-length sequences to a common length, the samples in a mini-batch unroll to different depths, which makes the forward and backward computation hard to implement
</font>
----
##### Example: RNN encoder-decoder with attention
<font size=5>
- Take the RNN encoder-decoder model for machine translation (it exercises every design element of `dynamicRNN`); the figure below shows the raw input of the RNN encoder-decoder:
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/raw_input.png" width=100%><br><font size=3> Figure. RNN encoder-decoder 原始batch 输入数据</font>
</p>
- the source word sequences are the input to the encoder RNN, a LoDTensor
- the target word sequences are the input to the lookup_table, a LoDTensor
- each rectangle above is one contiguous region of CPU/GPU memory, holding one dense vector
</font>
---
### How does `dynamicRNN` compute in batches?
<font size=5>
1. Sort the unequal-length samples within a mini batch: the longest sample becomes the first in the batch, the shortest the last
   - `LoDTensor` -> `LoDRankTable` :heavy_plus_sign: `lod_rank_table` operator
     - Think of a `LoDRankTable` as the sequences of a LoDTensor sorted by length; it stores the post-sort indices
2. Build the batch input for each time step; as the time step grows, the per-step batch may shrink
   - `TensorArray` :heavy_plus_sign: `lod_tensor_to_array` -> `LoDTensor` (without LoD)
3. Write each time step's output into an output `LoDTensorArray`
4. When the `dynamicRNN` loop finishes, re-sort the output `LoDTensorArray` according to the `LoDRankTable`, restoring the original input order
   - `TensorArray` :heavy_plus_sign: `array_to_lod_tensor` -> `LoDTensor`
</font>
---
### Running example
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sorted_input.png" width=100%>
</p>
---
### Running example
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=100%>
</p>
<font size=5>
- When execution reaches time steps 5 through 7, the batch size shrinks
</font>
---
### Running example
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=80%>
</p>
<font size=5>
- What happens to the RNN's `memory` at steps 5 through 7?
  - A `memory` points to some operator's output Tensor and "fetches back" that result after the operator's forward computation
  - At steps 5 through 7, some sequences have ended; ==the next time step must no longer unroll over the finished sequences==
  - In `dynamicRNN`, the `shrink_memory` operator shrinks the `memory`'s batch input
</font>
---
### Running example: steps 1 ~ 2
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/2.png" width=70%><br><font size=4>Figure. 第1、2个batch输入dynamicRNN的batch输入</font>
</p>
---
### Running example: steps 3 ~ 4
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/3.png" width=70%><br><font size=4>Figure. 第3、4个batch输入dynamicRNN的batch输入</font>
</p>
---
### Running example: steps 5 ~ 7
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/4.png" width=70%><br><font size=4>Figure. 第5、6、7个batch输入dynamicRNN的batch输入</font>
</p>
---
### ==8.== Fluid code layout
---
### Fluid code layout
<table>
<thead>
<tr>
<th>Code layout</th>
<th>Module structure</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_1.png" width=60%>
</p>
</td>
<td>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_2.png" width=60%>
</p>
</td>
</tr>
</tbody>
</table>
---
### ==9.== Documentation summary
---
<font size=5>
- Design overviews
  - refactoring overview [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
  - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
  - fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
- Core concepts
  - variable description [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md)
  - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md)
  - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
  - TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md)
  - Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md)
  - Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md)
  - Scope [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)
---
- Key functional modules
  - backward [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/backward.md)
  - memory optimization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/memory_optimization.md)
  - evaluator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/executor.md)
  - python API [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md)
  - regularization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/regularization.md)
- Development guides
  - supporting new hardware/device libraries [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md)
  - adding a new Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md)
  - adding a new Kernel [->](
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
</font>
---
### ==10.== Development guide
---
#### Recommended dev environment: build and test with Docker
<font size=5>
Building the PaddlePaddle source with Docker: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
PaddlePaddle on Docker Hub: [->](
https://hub.docker.com/r/paddlepaddle/paddle/tags/)
1. Pull the PaddlePaddle Docker image
```bash
docker pull paddlepaddle/paddle:latest-dev
```
1. Start a docker container
```bash
docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
```
1. Inside the container, build from source; see the documentation [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html)
</font>
---
### Notes
<font size=5>
1. To keep it small, the PaddlePaddle Docker image ships without vim; run `apt-get install -y vim` inside the container if you need it.
1. For development, use the image tagged `latest-dev`, which bundles all build dependencies. `latest` and `latest-gpu` are production images, intended for running PaddlePaddle programs.
1. To run GPU programs in Docker, nvidia-docker is recommended; [otherwise you must mount the CUDA libraries and devices into the container](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
<font size=4>
```bash
nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
```
</font>
</font>
---
### [How to contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
<font size=5>
- ==Before opening a Pull Request, be sure to read==: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
- Code requirements
  1. Code comments must follow the Doxygen style
  1. Make sure the compiler option WITH_STYLE_CHECK is on and the build passes the style checks
  1. All code must come with unit tests, and all unit tests must pass
- Submit Pull Requests with the `pre-commit` hook
  1. It helps format the source code (C++, Python)
  1. It automatically checks basics before each commit: exactly one EOL per file, no large files added to Git, and so on
  1. Install pre-commit and run it once in the PaddlePaddle root directory:
```bash
➜ pip install pre-commit
➜ pre-commit install
```
</font>
---
### How to contribute
<font size=5>
1. Open an issue before you start developing.
   - It lets others know the work is underway and avoids two people building the same feature.
1. A PR must be linked to its issue; see [->](https://help.github.com/articles/closing-issues-using-keywords/)
   - Purpose: the commit history then records which feature the PR implements or which problem it solves.
   - When the PR is merged, the linked issue closes automatically.
1. During PR review, every reviewer comment must receive a reply.
   - If the fix is done, a simple "Done" suffices.
   - Purpose: review comments may be (1) questions, (2) items that can be fixed in the next PR, or (3) suggestions the author considers unreasonable. Explicit replies leave a traceable record for the reviewer and everyone else of whether a change was made, deferred to a follow-up PR, or declined.
</font>
---
### ==11.== Adding a new Operator
---
### Concepts
<font size=5>
Adding a new operator involves deriving from the following C++ classes:
1. `framework::OperatorBase`: the Operator (Op for short) base class.
1. `framework::OpKernel`: the base class of an Op's compute functions, called Kernels.
1. `framework::OperatorWithKernel`: derives from OperatorBase; an Op that has compute functions, i.e. has Kernels.
1. `class OpProtoAndCheckerMaker`: describes the Op's inputs, outputs, attributes, and documentation, mainly used to generate the Python API.
Depending on whether they have a kernel, Ops come in two kinds:
1. Ops with Kernels derive from OperatorWithKernel; ==the vast majority of operators are of this kind==
1. Ops without a kernel derive from OperatorBase; only a few Ops belong here, e.g. while_op, ifelse_op
<span style="background-color:#DAB1D5;">This section focuses on how to write an Op with a Kernel.</span>
</font>
---
#### Which files does a new Operator add or modify?
<font size=5>
<table>
<thead>
<tr>
<th>What</th>
<th>Where it is defined</th>
</tr>
</thead>
<tbody>
<tr>
<td>
OpProtoMaker definition
</td>
<td>
`.cc` file; <span style="background-color:#DAB1D5;">backward Ops need no OpProtoMaker</span>
</td>
</tr>
<tr>
<td>
Op definition
</td>
<td>
`.cc` file
</td>
</tr>
<tr>
<td>
Kernel implementation
</td>
<td>
<span style="background-color:#DAB1D5;">CPU、CUDA共享Kernel实现在`.h`文件中</span>,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。
</td>
</tr>
<tr>
<td>
Op registration
</td>
<td>
The Op is registered in the `.cc` file; the CPU Kernel registration lives in the `.cc` file and the CUDA Kernel registration in the `.cu` file
</td>
</tr>
</tbody>
</table>
- Before adding an Operator, read the [Operator naming convention](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/name_convention.md) and the [Operator Markdown comment convention](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/op_markdown_format.md)
- New ops go under [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators), in files named `*_op.h` (if any), `*_op.cc`, and `*_op.cu` (if any).
- Ops and their Python bindings are wired up automatically from the file names; <span style="background-color:#DAB1D5;">stick to this naming, or you will additionally have to modify the PyBind files and CMakeLists.txt</span>
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 1</span>: define the ProtoMaker class
<font size=5>
The walkthrough below uses [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h) as the running example
- clip_op computes: $Out = \min(\max(X, min), max)$
- First define a `ProtoMaker` that describes the Op's inputs and outputs and adds documentation (<font size=4>*the comments in the snippet below are simplified; real code must follow the comment convention*</font>):
```cpp
template <typename AttrType>
class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X","(Tensor)The input of clip op.");
AddOutput("Out", "(Tensor),The output of clip op.");
AddAttr<AttrType>(
"min", "(float),Minimum value.");
AddAttr<AttrType>(
"max", "(float),Maximum value.");
AddComment(R"DOC(
……
)DOC");
}
};
```
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 2</span>: define the Operator class
<font size=5>
The snippet below defines `clip_op`:
```cpp
class ClipOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ClipOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ClipOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto max = ctx->Attrs().Get<float>("max");
auto min = ctx->Attrs().Get<float>("min");
PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
```
</font>
---
### What the Operator class must do
<font size=5>
1. clip_op derives from `OperatorWithKernel`; the line
```cpp
using framework::OperatorWithKernel::OperatorWithKernel;
```
   means it uses the constructor of the base class `OperatorWithKernel`.
1. It overrides the `InferShape` interface.
   - `InferShape` is a const function and must not modify the Op's member variables
   - Its parameter, `const framework::InferShapeContext &ctx`, gives access to the inputs, outputs, and attributes
   - `InferShape` is called twice: once at compile time (when the op is created) and once at run time (when the op's `Run` method is invoked). It must:
     1. check early and fail early: validate that input dimensions, types, etc. are legal
     2. set the shape of the output Tensor
<span style="background-color:#DAB1D5;">By convention, the `OpProtoMaker` and the `Op` class definition are written in the `.cc` file.</span>
</font>
---
### Additional notes
<font size=5>
1. `InferShape` currently supports two implementation styles; <span style="background-color:#DAB1D5;">both end up registering a functor into the OpInfo struct.</span>
   1. derive from framework::InferShapeBase and implement it as a functor (see [mul_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22))
   2. override the InferShape function (see [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24))
1. What is a `functor`?
   - A class or struct that only overloads `()`; usually a compute function that several kernels can reuse.
<font size=4>
```cpp
template <typename T>
class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& ctx,
framework::Tensor* out,
const framework::Tensor* prob,
const framework::Tensor* labels, const bool softLabel) {
……
}
};
```
</font>
- clip_op also shows how a piece of computation is abstracted into a functor: [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27).
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 3</span>: define the OpKernel class
<font size=5>
- `ClipKernel` derives from `framework::OpKernel` and carries two template parameters:
  1. `typename DeviceContext`: the device type. Add this parameter when several devices share one Kernel; otherwise provide a separate specialization per device.
  1. `typename T`: the supported data types, e.g. `float`, `double`.
- Override the `Compute` method in the `ClipKernel` class
  1. `Compute` takes one argument: `const framework::ExecutionContext& context`
     - `ExecutionContext` gathers the op's run-time input and output `Variable`s from the `Scope`, so `Compute` can fetch whatever it needs simply by name
     - Compared with `InferShapeContext`, `ExecutionContext` additionally carries the device type
  1. The `OpKernel`'s concrete computation logic is implemented inside `Compute`
</font>
---
#### ClipKernel at a glance
<font size=5>
```cpp
template <typename DeviceContext, typename T>
class ClipKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto max = context.Attr<T>("max");
auto min = context.Attr<T>("min");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
T* out_data = out->mutable_data<T>(context.GetPlace());
const T* x_data = x->data<T>();
int64_t numel = x->numel();
Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), x_data,
x_data + numel, out_data, ClipFunctor<T>(min, max));
}
};
```
- To keep `OpKernel` implementations simple and let CPU and CUDA share code, Fluid uses Eigen as its basic matrix library
- Fluid provides thin wrappers around Eigen's unsupported Tensor module that can be called directly inside `Compute`
- For how to use Eigen inside PaddlePaddle, see the [usage doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 4</span>: implement the backward Op
<font size=5>
- ==**A backward Op has no `ProtoMaker`**==; apart from that, it is defined and implemented exactly like a forward Op, so the details are not repeated here
- Only the backward Op's inputs and outputs deserve a note:
  1. Backward Op inputs
     - the forward Op's outputs
     - the gradients handed to this Op during back-propagation
     - Note that <span style="background-color:#e1c4c4;">Fluid does not distinguish cost Ops from intermediate-layer Ops; every Op must handle the gradients it receives correctly</span>
  2. Backward Op outputs
     - the gradients with respect to the learnable parameters
     - the gradients with respect to all inputs
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 5</span>: register the Op and its Kernels
<font size=5>
With the Op and its kernels implemented, register them in the `.cc` and `.cu` files
1. In the `.cc` file, register the forward and backward Op classes and the CPU Kernels.
<font size=4>
```cpp
namespace ops = paddle::operators;
REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
ops::ClipOpGrad);
REGISTER_OP_CPU_KERNEL(
clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
```
- In the snippet above:
  1. `REGISTER_OP`: registers `ops::ClipOp` under the type name `clip` with `ops::ClipOpMaker` as its `ProtoMaker`, and registers `ops::ClipOpGrad` under `clip_grad`
  1. `REGISTER_OP_WITHOUT_GRADIENT`: registers an Op without a backward, e.g. optimizer-related Ops
  1. `REGISTER_OP_CPU_KERNEL`: registers `ops::ClipKernel` specialized for `paddle::platform::CPUDeviceContext` and `float`, and likewise `ops::ClipGradKernel`
</font>
1. Register the GPU Kernels the same way in the `.cu` file
   - <span style="background-color:#e1c4c4;">If the CUDA Kernel is implemented with Eigen, add the macro `#define EIGEN_USE_GPU` at the top of the `.cu` file</span>
</font>
---
##### Building and the Python binding
<font size=5>
- The command below builds only the newly added Op:
```
make mul_op
```
- <span style="background-color:#e1c4c4;">Note that running the unit tests requires building the whole project</span>
- If the file-naming rules above are followed, the build automatically generates the Python binding for the new op and links it into the output library
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 6</span>: add forward unit tests and a gradient check
<font size=5>
- Unit tests for new Ops all go under [python/paddle/v2/fluid/tests/unittests](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/unittests)
- Forward Operator tests
  1. Op unit tests derive from `OpTest`; the concrete cases live in `TestClipOp`, and every test case is named `TestXX`
  1. Testing an Operator requires:
     1. defining the inputs, outputs, and relevant attributes in the `setUp` function
     1. generating random input data
     1. re-implementing the forward operator's computation in the Python script and comparing its output with the operator's forward output
  1. The gradient-check machinery is already in place; simply call `check_grad`
- See the `clip_op` unit test [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_clip_op.py); a minimal sketch follows
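- A minimal sketch of such a test (the real `test_clip_op.py` additionally nudges inputs away from the clip boundaries so the gradient check stays stable):
```python
import numpy as np
from op_test import OpTest  # test harness under python/paddle/fluid/tests/unittests


class TestClipOp(OpTest):
    def setUp(self):
        self.op_type = "clip"
        x = np.random.random((4, 5)).astype("float32")
        self.inputs = {'X': x}
        self.attrs = {'min': 0.3, 'max': 0.7}
        self.outputs = {'Out': np.clip(x, 0.3, 0.7)}  # NumPy reference forward

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['X'], 'Out')
```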
</font>
---
#### Building and running the unit tests
<font size=5>
- New `test_*.py` unit tests under `python/paddle/v2/framework/tests` are picked up and built automatically
- <span style="background-color:#e1c4c4;">Running the unit tests requires building the whole project with `WITH_TESTING` on</span>, i.e. `cmake paddle_dir -DWITH_TESTING=ON`
- After a successful build, run the unit tests with:
```bash
make test ARGS="-R test_mul_op -V"
```
or:
```
ctest -R test_mul_op
```
</font>
---
### Caveats when adding an Op
<font size=5>
- Give each Op its own `*_op.h` (if any), `*_op.cc`, and `*_op.cu` (if any). <span style="background-color:#e1c4c4;">A single file must not contain more than one Op</span>; that breaks the build.
- The type name used at registration must equal the Op's name. <span style="background-color:#e1c4c4;">Registering `REGISTER_OP(B, ...)` inside `A_op.cc` is not allowed</span>; it breaks the unit tests.
- If an Op <span style="background-color:#e1c4c4;">has no CUDA Kernel, do not create an empty `*_op.cu`</span>; that breaks the unit tests.
- If several Ops depend on shared helper functions, put them in files not named `*_op.*`, e.g. `gather.h`.
</font>
---
### ==12.== Usage FAQ
---
### Defining the forward computation
<font size=5>
- When the Python side executes:
```python
import paddle.v2.fluid as fluid
```
[`framework.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#L1040) defines two global `Program`s:
```python
# program is a global instance.
_main_program_ = Program()
_startup_program_ = Program()
```
- Defining the forward pass means continually appending Ops and Variables to the `main_program`
- To execute a different `main_program`, call:
```python
def switch_main_program(program):
"""
Switch the main program to a new program.
This function returns the previous main program.
"""
……
```
</font>
---
### Customizing parameter initialization
<font size=5>
- Call the `fluid.ParamAttr(……)` interface to customize parameter initialization
```python
w_param_attrs = ParamAttr(name=None,
initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
learning_rate=1.0,
regularizer=L1Decay(1.0),
trainable=True,
clip=GradientClipByValue(-1.0, 1.0),
)
y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
```
- 补充问题:如何创建 `Variable`
```python
cur_program = Program()
cur_block = cur_program.current_block()
new_var = cur_block.create_var(name="X", shape=[-1, 16, 16], dtype="float32")
```
</font>
---
### Appending backward Ops
<font size=5>
- Call `fluid.backward.append_backward(X)` (where `X` is a Variable) to append the backward Ops for a forward `ProgramDesc`
```python
data = fluid.layers.data(name="data", shape=(2,3,4))
out = fluid.layers.fc(input=data,size=128,act=None)
loss = fluid.layers.reduce_sum(out)
fluid.backward.append_backward(loss=loss)
```
- Append the optimization-related Ops
```python
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(loss)
```
- You can call `print(fluid.default_main_program())` at any time to dump the current `main_program`
- Once the whole `Program` is built, run memory optimization with:
```python
fluid.memory_optimize(fluid.default_main_program())
```
- _<span style="background-color:#e1c4c4;">Note: memory optimization is still under active development and may not be stable.</span>_
</font>
---
### Summary: the compile-time flow
<font size=5>
- The user defines the forward computation
- Backward Ops are appended to `default_main_program`
- Gradient-clipping Ops are appended to `default_main_program`
- Regularization Ops are appended to `default_main_program`
- The state variables of the chosen optimizer are added to `default_startup_program`
  - state variables are things like the learning rate, momentum history, second-order momentum, etc.
- Variable-initialization Ops are added to `default_startup_program`
- An Op is appended to `default_main_program` that sets the gradient received by the network's last op
- Memory-optimization planning runs (the whole flow is sketched below)
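- Put together, a minimal sketch of the flow (gradient clipping and regularization would be configured via `ParamAttr`/optimizer arguments):
```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=y_predict, label=y))

fluid.optimizer.SGD(learning_rate=0.001).minimize(loss)  # appends backward + optimize ops
fluid.memory_optimize(fluid.default_main_program())      # memory-reuse planning
```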
</font>
---
### Feeding data (1): through a feed dict
<font size=5>
- When calling the executor's run method, pass a feed dict; the feed op places the given data into the `x` and `y` Variables
```python
y_data = np.random.randint(0, 8, [1]).astype("int32")
y_tensor = core.Tensor()
y_tensor.set(y_data, place)
x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32")
x_tensor = core.Tensor()
x_tensor.set(x_data, place)
……
cost = exe.run(
fluid.default_main_program(),
feed={'x': x_tensor,
'y': y_tensor},
fetch_list=[avg_cost])
```
- This approach is fairly low-level and is mostly used in unit tests
</font>
---
### Feeding data (2): through the DataFeeder interface
<font size=5>
- Write a data_reader function; a data_reader is a Python generator
```python
def demo_reader():
def random_generator():
yield np.random.uniform(0.1, 1, [4]), np.random.randint(0, 1, [1])
return random_generator
```
- Use the DataFeeder interface in the training job
```python
train_reader = paddle.batch(
paddle.reader.shuffle(demo_reader(), buf_size=500), batch_size=4)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
for data in train_reader():
cost = exe.run(
fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[cost])
```
</font>
---
### FAQ
<font size=5>
- How do I use an evaluator? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py#L168)
```python
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
for pass_id in range(PASS_NUM):
accuracy.reset()
for data in train_reader():
loss, acc = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe)
# acc: accuracy of the current batch
# pass_acc: accumulated accuracy over the batches seen so far in this pass
pass_total_acc = accuracy.eval(exe) # accuracy over the whole pass
```
- How do I evaluate during training? [->](https://github.com/dzhwinter/benchmark/blob/master/fluid/vgg16.py#L144)
- How do I save a trained model? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L143)
- How do I load a trained model for inference? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L154)
- How do I define several Programs in one training job and run them alternately? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/demo/fc_gan.py)
- How do I profile? Fluid ships a profiling tool that can be called directly; see the example [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_profiler.py)
</font>
---
......@@ -35,13 +35,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
# 2. 可选步骤:源码中构建用于编译PaddlePaddle的Docker镜像
docker build -t paddle:dev .
# 3. 执行下面的命令编译CPU-Only的二进制
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. 或者也可以使用为上述可选步骤构建的镜像(必须先执行第2步)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
注:上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。如果使用自行
构建的镜像(上述第4步)会执行 :code:`Dockerfile` 描述的默认入口程序 :code:`build.sh` 可以省略步骤3中
最后的执行脚本的命令。
注:上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。
编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装:
......@@ -72,15 +70,15 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
如果期望执行其中一个单元测试,(比如 :code:`test_sum_op` ):
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh
cd /paddle/build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
./paddle/scripts/paddle_build.sh build
cd build
ctest -R test_sum_op -V
.. _faq_docker:
......
......@@ -34,14 +34,12 @@ Or you can build your own image from source as the optional step below:
# 2. Optional: build development docker image from source
docker build -t paddle:dev .
# 3. Run the following command to build a CPU-Only binaries
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
NOTE: The above command try to mount the current working directory (root directory of source code)
into :code:`/paddle` directory inside docker container. If you are using your own image
(Step 4) it will run default entry-point :code:`build.sh` , so you could omit the last
command in step 3.
into :code:`/paddle` directory inside docker container.
When the compile finishes, you can get the output whl package under
build/python/dist, then you can choose to install the whl on local
......@@ -74,15 +72,15 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
If you wish to run only one unit test, like :code:`test_sum_op`:
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh
cd /paddle/build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
./paddle/scripts/paddle_build.sh build
cd build
ctest -R test_sum_op -V
.. _faq_docker:
......
......@@ -98,7 +98,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
国内用户可以使用下面的镜像源来加速访问:
.. code-block: bash
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
......
......@@ -105,7 +105,7 @@ We provide a packaged book image, simply issue the command:
For users in China, we provide a faster mirror:
.. code-block: bash
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
......
......@@ -11,7 +11,6 @@ GTAGS
*.pb.cc
*.pb.h
*_pb2.py
paddle_*
output/
google/
Makefile
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
add_subdirectory(inference)
......@@ -89,7 +89,7 @@ cd Paddle
# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
nvidia-docker build -t paddle:float16 .
# After running this, different results will be written to different log files in Paddle/contrib/float16/
nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/paddle/contrib/float16/run_float16_demo.sh
```
#### Accuracy
......
......@@ -3,7 +3,7 @@
BUILD_PATH=/paddle/fp16_build
WHEEL_PATH=$BUILD_PATH/python/dist
INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
DEMO_PATH=/paddle/contrib/float16
DEMO_PATH=/paddle/paddle/contrib/float16
# Use the single most powerful CUDA GPU on your machine
export CUDA_VISIBLE_DEVICES=0
......@@ -50,7 +50,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
--repeat=$REPEAT \
......@@ -68,7 +67,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_resnet \
--data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
--repeat=$REPEAT \
......@@ -86,7 +84,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
--repeat=$REPEAT \
......@@ -104,7 +101,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
--repeat=$REPEAT \
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
function(inference_api_test TARGET_NAME TEST_SRC DEP_TEST)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS})
list(APPEND arg_list "_${arg}")
endforeach()
else()
list(APPEND arg_list "_")
endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(${TARGET_NAME}
SRCS ${TEST_SRC}
DEPS paddle_fluid_api paddle_inference_api paddle_inference_api_impl
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
# set_tests_properties(${TARGET_NAME}
# PROPERTIES DEPENDS ${DEP_TEST})
endforeach()
endfunction(inference_api_test)
cc_library(paddle_inference_api
SRCS paddle_inference_api.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_library(paddle_inference_api_impl
SRCS paddle_inference_api_impl.cc
DEPS paddle_inference_api paddle_fluid_api)
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api)
inference_api_test(test_paddle_inference_api_impl
test_paddle_inference_api_impl.cc
test_word2vec)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
......@@ -12,49 +12,74 @@
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the definition of a simple Inference API for Paddle.
*
* ATTENTION: It requires some C++ features, for lower version C++ or C, we
* might release another API.
*/
#pragma once
#include <memory>
#include <string>
#include <vector>
namespace paddle {
class Predictor {
public:
struct Attr;
Predictor() = default;
enum PaddleDType {
FLOAT32,
INT64,
};
// Build the network before inference.
bool Init(const Attr& attr);
struct PaddleBuf {
void* data; // pointer to the data memory.
size_t length; // number of memory bytes.
};
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
/*
* A simple Inference API for Paddle. Currently this API might just be used by
* non-sequence scenerios.
* TODO(Superjomn) Prepare another API for NLP-related usages.
*/
class PaddlePredictor {
public:
struct Config;
PaddlePredictor() = default;
PaddlePredictor(const PaddlePredictor&) = delete;
// Predict an record.
// Arguments:
// inputs: the name of the input variables.
// outputs: the name of the output varaibles.
// input_shapes: the shape of the input variables.
// output_shapes: the shape of the output variables.
// input_data: the data of the input variables.
// output_data: the data of the output variables.
bool Run(const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::vector<std::vector<int>>& input_shapes,
const std::vector<std::vector<int>>& output_shapes,
const std::vector<std::vector<float>>& input_data,
std::vector<std::vector<float>>* output_data);
// Clone a predictor that share the model weights.
Predictor* Clone();
// The caller should be responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be alive until Run returns. caller should be
// responsible for releasing the memory of `output_data`.
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) = 0;
// Clone a predictor that share the model weights, the Cloned predictor should
// be thread-safe.
virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
// Destroy the Predictor.
~Predictor();
virtual ~PaddlePredictor() {}
struct Attr {
friend std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
const PaddlePredictor::Config& config);
// The common configs for all the predictors.
struct Config {
enum class EngineKind;
std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on
// third-party engines.
EngineKind engine_kind{Attr::EngineKind::kNone};
// third-party engines.
EngineKind engine_kind{Config::EngineKind::kNone};
enum class EngineKind {
kNone = -1, // Use the native Fluid facility.
......@@ -66,4 +91,8 @@ public:
};
};
// A factory to help create difference predictor.
template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sys/time.h>
#include <algorithm>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
namespace paddle {
namespace {
// Timer for timer
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
template <class T>
std::string num2str(T a) {
std::stringstream istr;
istr << a;
return istr.str();
}
} // namespace
bool PaddlePredictorImpl::Init() {
VLOG(3) << "Predictor::init()";
// TODO(panyx0718): Should CPU vs GPU device be decided by id?
if (config_.device >= 0) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
paddle::framework::InitDevices(false);
executor_.reset(new paddle::framework::Executor(place_));
scope_.reset(new paddle::framework::Scope());
// Initialize the inference program
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << "fail to load inference model.";
return false;
}
ctx_ = executor_->Prepare(*inference_program_, 0);
// Create variables
// TODO(panyx0718): Why need to test share_variables here?
if (config_.share_variables) {
executor_->CreateVariables(*inference_program_, scope_.get(), 0);
}
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true;
}
bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
VLOG(3) << "Predictor::predict";
Timer timer;
timer.tic();
// set feed variable
std::map<std::string, const paddle::framework::LoDTensor *> feed_targets;
std::vector<paddle::framework::LoDTensor> feeds;
if (!SetFeed(inputs, &feeds)) {
LOG(ERROR) << "fail to set feed";
return false;
}
for (size_t i = 0; i < feed_target_names_.size(); ++i) {
feed_targets[feed_target_names_[i]] = &feeds[i];
}
// get fetch variable
std::map<std::string, paddle::framework::LoDTensor *> fetch_targets;
std::vector<paddle::framework::LoDTensor> fetchs;
fetchs.resize(fetch_target_names_.size());
for (size_t i = 0; i < fetch_target_names_.size(); ++i) {
fetch_targets[fetch_target_names_[i]] = &fetchs[i];
}
// Run the inference program
// if share variables, we need not create variables
executor_->RunPreparedContext(ctx_.get(),
scope_.get(),
&feed_targets,
&fetch_targets,
!config_.share_variables);
if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs";
return false;
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
return true;
}
std::unique_ptr<PaddlePredictor> PaddlePredictorImpl::Clone() {
VLOG(3) << "Predictor::clone";
std::unique_ptr<PaddlePredictorImpl> cls(new PaddlePredictorImpl(config_));
if (!cls->InitShared(this)) {
LOG(ERROR) << "fail to call InitShared";
return nullptr;
}
return cls;
}
// TODO(panyx0718): Consider merge with Init()?
bool PaddlePredictorImpl::InitShared(PaddlePredictorImpl *cls) {
VLOG(3) << "Predictor::init_shared";
// 1. Define place, executor, scope
if (this->config_.device >= 0) {
place_ = paddle::platform::CUDAPlace();
} else {
place_ = paddle::platform::CPUPlace();
}
this->executor_.reset(new paddle::framework::Executor(this->place_));
this->scope_.reset(new paddle::framework::Scope());
// Initialize the inference program
if (!this->config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
this->inference_program_ = paddle::inference::Load(
this->executor_.get(), this->scope_.get(), this->config_.model_dir);
} else if (!this->config_.prog_file.empty() &&
!this->config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
this->inference_program_ =
paddle::inference::Load(this->executor_.get(),
this->scope_.get(),
this->config_.prog_file,
this->config_.param_file);
}
this->ctx_ = this->executor_->Prepare(*this->inference_program_, 0);
// 3. create variables
// TODO(panyx0718): why test share_variables.
if (config_.share_variables) {
this->executor_->CreateVariables(
*this->inference_program_, this->scope_.get(), 0);
}
// 4. Get the feed_target_names and fetch_target_names
this->feed_target_names_ = this->inference_program_->GetFeedTargetNames();
this->fetch_target_names_ = this->inference_program_->GetFetchTargetNames();
return true;
}
bool PaddlePredictorImpl::SetFeed(
const std::vector<PaddleTensor> &inputs,
std::vector<paddle::framework::LoDTensor> *feeds) {
VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feed_target_names_.size()) {
LOG(ERROR) << "wrong feed input size.";
return false;
}
for (size_t i = 0; i < feed_target_names_.size(); ++i) {
paddle::framework::LoDTensor input;
paddle::framework::DDim ddim =
paddle::framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr =
input.mutable_data<int64_t>(ddim, paddle::platform::CPUPlace());
} else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, paddle::platform::CPUPlace());
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr),
inputs[i].data.data,
inputs[i].data.length);
feeds->push_back(input);
LOG(ERROR) << "Actual feed type " << feeds->back().type().name();
}
return true;
}
bool PaddlePredictorImpl::GetFetch(
const std::vector<paddle::framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *outputs) {
VLOG(3) << "Predictor::get_fetch";
outputs->resize(fetchs.size());
for (size_t i = 0; i < fetchs.size(); ++i) {
// TODO(panyx0718): Support fetch of other types.
if (fetchs[i].type() != typeid(float)) {
LOG(ERROR) << "only support fetching float now.";
return false;
}
std::vector<int> shape;
auto dims_i = fetchs[i].dims();
auto lod = fetchs[i].lod();
const float *output_ptr = fetchs[i].data<float>();
// const int64_t* output_ptr = fetchs[i].data<int64_t>();
auto num = fetchs[i].numel();
std::vector<float> data;
if (0 == lod.size()) {
std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
for (int j = 0; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
} else {
// for batch detection
// image[0] -> output[0] shape {145, 6}
// image[1] -> output[1] shape {176, 6}
// then,
// the batch output shape {321, 6}
// the lod {{0, 145, 321}}
// so we should append output[0] to {176, 6}
size_t max_dim = 0;
for (size_t j = 1; j < lod[0].size(); j++) {
max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
}
size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
if (max_dim > 0) {
data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
}
for (size_t j = 1; j < lod[0].size(); j++) {
size_t start = lod[0][j - 1] * common_dim;
size_t end = lod[0][j] * common_dim;
if (end > start) {
std::copy(output_ptr + start,
output_ptr + end,
data.begin() + (j - 1) * max_dim * common_dim);
}
}
shape.push_back(lod[0].size() - 1);
shape.push_back(max_dim);
for (int j = 1; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
}
outputs->at(i).shape = shape;
outputs->at(i).data.length = sizeof(float) * data.size();
outputs->at(i).data.data = malloc(outputs->at(i).data.length);
std::memcpy(
outputs->at(i).data.data, data.data(), outputs->at(i).data.length);
outputs->at(i).dtype = PaddleDType::FLOAT32;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy.
}
return true;
}
std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl(
const VisConfig &config) {
VLOG(3) << "create PaddlePredictorImpl";
// 1. GPU memeroy
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
num2str<float>(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
}
std::unique_ptr<PaddlePredictorImpl> predictor(
new PaddlePredictorImpl(config));
if (!predictor->Init()) {
return nullptr;
}
return predictor;
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <memory>
#include <string>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
struct VisConfig : public PaddlePredictor::Config {
int device;
float fraction_of_gpu_memory;
std::string prog_file;
std::string param_file;
bool share_variables;
};
/*
* Do not use this, just a demo indicating how to customize a Predictor.
*/
class PaddlePredictorImpl : public PaddlePredictor {
public:
explicit PaddlePredictorImpl(const VisConfig &config) : config_(config) {}
bool Init();
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override;
std::unique_ptr<PaddlePredictor> Clone() override;
~PaddlePredictorImpl() override {}
private:
bool InitShared(PaddlePredictorImpl *cls);
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
std::vector<paddle::framework::LoDTensor> *feeds);
bool GetFetch(const std::vector<paddle::framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *output_data);
VisConfig config_;
paddle::platform::Place place_;
std::unique_ptr<paddle::framework::Executor> executor_;
std::unique_ptr<paddle::framework::Scope> scope_;
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx_;
std::unique_ptr<paddle::framework::ProgramDesc> inference_program_;
std::vector<std::string> feed_target_names_;
std::vector<std::string> fetch_target_names_;
};
std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl(
const VisConfig &config);
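// A minimal usage sketch (the model path below is a placeholder and `inputs`
// is assumed to be a prepared std::vector<PaddleTensor>):
//   VisConfig config;
//   config.model_dir = "/path/to/inference_model";
//   config.fraction_of_gpu_memory = 0.5;
//   config.device = 0;
//   std::unique_ptr<PaddlePredictorImpl> predictor =
//       CreatePaddlePredictorImpl(config);
//   std::vector<PaddleTensor> outputs;
//   if (predictor) predictor->Run(inputs, &outputs);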
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
namespace paddle {
/*
* Do not use this, just a demo indicating how to customize a config for a
* specific predictor.
*/
struct DemoConfig : public PaddlePredictor::Config {
float other_config;
};
/*
* Do not use this, just a demo indicating how to customize a Predictor.
*/
class DemoPredictor : public PaddlePredictor {
public:
explicit DemoPredictor(const DemoConfig &config) {
LOG(INFO) << "I get other_config " << config.other_config;
}
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override {
LOG(INFO) << "Run";
return false;
}
std::unique_ptr<PaddlePredictor> Clone() override { return nullptr; }
~DemoPredictor() override {}
};
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<DemoConfig>(
const DemoConfig &config) {
std::unique_ptr<PaddlePredictor> x(new DemoPredictor(config));
return x;
}
TEST(paddle_inference_api, demo) {
DemoConfig config;
config.other_config = 1.7;
auto predictor = CreatePaddlePredictor(config);
std::vector<PaddleTensor> outputs;
predictor->Run({}, &outputs);
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
namespace paddle {
PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
PaddleTensor pt;
pt.data.data = t->data<void>();
if (t->type() == typeid(int64_t)) {
pt.data.length = t->numel() * sizeof(int64_t);
pt.dtype = PaddleDType::INT64;
} else if (t->type() == typeid(float)) {
pt.data.length = t->numel() * sizeof(float);
pt.dtype = PaddleDType::FLOAT32;
} else {
LOG(FATAL) << "unsupported type.";
}
pt.shape = framework::vectorize2int(t->dims());
return pt;
}
TEST(paddle_inference_api_impl, word2vec) {
VisConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.85;
config.device = 0;
config.share_variables = true;
std::unique_ptr<PaddlePredictorImpl> predictor =
CreatePaddlePredictorImpl(config);
framework::LoDTensor first_word, second_word, third_word, fourth_word;
framework::LoD lod{{0, 1}};
int64_t dict_size = 2073; // The size of dictionary
SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
std::vector<PaddleTensor> cpu_feeds;
cpu_feeds.push_back(LodTensorToPaddleTensor(&first_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&second_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&third_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&fourth_word));
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(cpu_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL);
for (size_t i = 0; i < outputs.size(); ++i) {
size_t len = outputs[i].data.length;
float* data = static_cast<float*>(outputs[i].data.data);
for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0);
}
free(outputs[i].data.data);
}
}
} // namespace paddle
......@@ -243,13 +243,8 @@ const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
}
void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
for (auto &output : outputs_) {
std::replace(output.second.begin(), output.second.end(), old_name,
new_name);
}
RenameInput(old_name, new_name);
RenameOutput(old_name, new_name);
need_update_ = true;
}
......@@ -274,6 +269,13 @@ void OpDesc::RenameInput(const std::string &old_name,
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
if (it != attrs_.end()) {
auto &op_vars = boost::get<std::vector<std::string>>(it->second);
std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
}
need_update_ = true;
}
......
......@@ -63,6 +63,7 @@ class InferShapeContext {
std::vector<InferShapeVarPtr> GetInputVarPtrs(const std::string &name);
std::vector<InferShapeVarPtr> GetOutputVarPtrs(const std::string &name);
virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
// Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names,
......@@ -81,8 +82,6 @@ class InferShapeContext {
const std::vector<std::string> &names) const;
virtual proto::VarType::Type GetVarType(const std::string &name) const = 0;
virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
};
} // namespace framework
......
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api
SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
......
nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
SERIAL)
# This test is not stable
# See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828
#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
# DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
# SERIAL)
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
......@@ -149,12 +149,14 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
}
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
// GPU data is copied to CPU buffer when sending,
// free the buffer when possible.
destroy_callback = [](void* backing) {
platform::CUDAPinnedPlace cuda_pinned;
memory::Free(cuda_pinned, backing);
};
#endif
}
std::string header;
......
......@@ -24,6 +24,8 @@ detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
detection_library(target_assign_op SRCS target_assign_op.cc
target_assign_op.cu)
detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu)
# Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto* in = ctx.Input<Tensor>("Input");
auto in_dims = in->dims();
const T* in_data = in->data<T>();
auto* out = ctx.Output<Tensor>("Output");
T* out_data = out->mutable_data<T>(ctx.GetPlace());
int batch_size = in_dims[0];
int geo_channel = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int id = 0;
for (int id_n = 0; id_n < batch_size * geo_channel; ++id_n) {
for (int id_h = 0; id_h < height; ++id_h) {
for (int id_w = 0; id_w < width; ++id_w) {
id = id_n * height * width + width * id_h + id_w;
if (id_n % 2 == 0) {
out_data[id] = id_w - in_data[id];
} else {
out_data[id] = id_h - in_data[id];
}
}
}
}
}
};
class PolygonBoxTransformOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("Input"),
"Input (Input) of polygon_box transform op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Output"),
"Output (Output) of polygon_box transform op should not be null.");
auto in_dim = ctx->GetInputDim("Input");
PADDLE_ENFORCE_EQ(in_dim.size(), 4, "input's rank must be 4.");
PADDLE_ENFORCE_EQ(in_dim[1] % 2, 0,
"input's second dimension must be even.");
ctx->SetOutputDim("Output", in_dim);
}
};
class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"Input",
"The input with shape [batch_size, geometry_channels, height, width]");
AddOutput("Output", "The output with the same shape as input");
AddComment(R"DOC(
PolygonBoxTransform Operator.
The input is the final geometry output in detection network.
We use 2*n numbers to denote the coordinate shift from n corner vertices of
the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi),
the geometry output contains 2*n channels.
PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
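For example, for the output pixel at (id_h=2, id_w=3), an x-offset value of
1.5 in an even geometry channel becomes 3 - 1.5 = 1.5, and a y-offset value
of 0.5 in the following odd channel becomes 2 - 0.5 = 1.5.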
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(polygon_box_transform, ops::PolygonBoxTransformOp,
ops::PolygonBoxTransformOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
polygon_box_transform,
ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, float>,
ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using platform::PADDLE_CUDA_NUM_THREADS;
#define CUDA_BLOCK_SIZE 16
template <typename T>
__global__ void PolygonBoxTransformKernel(const int n, const int h, const int w,
const T* input, T* output) {
int id_n = threadIdx.x + blockDim.x * blockIdx.x;
int id_h = threadIdx.y + blockDim.y * blockIdx.y;
int id_w = threadIdx.z + blockDim.z * blockIdx.z;
if (id_n < n && id_h < h && id_w < w) {
int id = id_n * h * w + w * id_h + id_w;
if (id_n % 2 == 0) {
output[id] = id_w - input[id];
} else {
output[id] = id_h - input[id];
}
}
}
template <typename T>
class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto* in = ctx.Input<Tensor>("Input");
auto in_dims = in->dims();
const T* in_data = in->data<T>();
auto* out = ctx.Output<Tensor>("Output");
T* out_data = out->mutable_data<T>(ctx.GetPlace());
int batch_size = in_dims[0];
int geo_channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
dim3 threadsPerBlock(
PADDLE_CUDA_NUM_THREADS / (CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE),
CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE);
// round up so that trailing (batch * channel) elements are still covered
dim3 numBlocks((batch_size * geo_channels + threadsPerBlock.x - 1) /
threadsPerBlock.x,
(height + threadsPerBlock.y - 1) / threadsPerBlock.y,
(width + threadsPerBlock.z - 1) / threadsPerBlock.z);
auto stream = ctx.cuda_device_context().stream();
PolygonBoxTransformKernel<T><<<numBlocks, threadsPerBlock, 0, stream>>>(
batch_size * geo_channels, height, width, in_data, out_data);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
polygon_box_transform,
paddle::operators::PolygonBoxTransformOpCUDAKernel<float>,
paddle::operators::PolygonBoxTransformOpCUDAKernel<double>);
......@@ -24,6 +24,14 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Output<framework::Tensor>("Out");
auto* in = ctx.Input<framework::LoDTensor>("Input");
if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
// set the correct batch size for the LoDTensor.
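// e.g. a LoD of {{0, 2, 5}} stores 3 offsets, i.e. 2 sequences, so the
// batch size is lod().back().size() - 1 = 2.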
auto odims = out->dims();
int output_dim_idx = ctx.Attr<int>("output_dim_idx");
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->mutable_data<T>(odims, ctx.GetPlace());
}
out->mutable_data<T>(ctx.GetPlace());
auto value = ctx.Attr<float>("value");
......
......@@ -46,7 +46,10 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
const int64_t* label_data = labels->data<int64_t>();
for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i];
int lbl = label_data[i];
PADDLE_ENFORCE_GE(lbl, 0);
PADDLE_ENFORCE_LT(lbl, class_num);
int index = i * class_num + lbl;
loss_data[i] = -math::TolerableValue<T>()(std::log(prob_data[index]));
}
}
......
......@@ -23,6 +23,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o
reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
# Export local libraries to parent
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class CustomReader : public framework::DecoratedReader {
public:
CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block,
const platform::Place& dev_place,
const std::vector<std::string>& source_var_names,
const std::vector<std::string>& sink_var_names)
: DecoratedReader(reader),
program_(*sub_block.Program()),
sub_block_id_(sub_block.ID()),
exe_(framework::Executor(dev_place)),
source_var_names_(source_var_names),
sink_var_names_(sink_var_names) {}
void ReadNext(std::vector<framework::LoDTensor>* out) override;
private:
const framework::ProgramDesc program_;
int sub_block_id_;
framework::Executor exe_;
std::vector<std::string> source_var_names_;
std::vector<std::string> sink_var_names_;
};
class CreateCustomReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
auto* sub_block = Attr<framework::BlockDesc*>("sub_block");
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
out->Reset(
new CustomReader(underlying_reader.Get(), *sub_block, dev_place,
Attr<std::vector<std::string>>("source_var_names"),
Attr<std::vector<std::string>>("sink_var_names")));
}
};
class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
protected:
void Apply() override {
AddAttr<framework::BlockDesc*>(
"sub_block", "The block to hold all preprocessing operators.");
AddAttr<std::vector<std::string>>(
"source_var_names",
"Source variables are starting points of data preprocessing. They hold "
"preprocessing's input tensors. Each source variable corresponds to "
"one of underlying reader's output datas.");
AddAttr<std::vector<std::string>>(
"sink_var_names",
"Sink variables are ending points of data preprocessing. They hold "
"preprocessing's output tensors. Each sink variable corresponds to "
"one of custom reader's output datas.");
AddComment(R"DOC(
CreateCustomReader Operator
A custom reader can be used for input data preprocessing.
A custom reader holds its own sub-block, which will be executed in its
'ReadNext()' function. Users can configure their own preprocessing
pipelines by inserting operators into the custom reader's sub-block.
)DOC");
}
};
class CustomReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(!ctx->IsRuntime(),
"'CustomReaderInferShape' should only be invoked during "
"compile time.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output decorated reader should not be null.");
const auto* sub_block =
ctx->Attrs().Get<framework::BlockDesc*>("sub_block");
const auto sink_var_names =
ctx->Attrs().Get<std::vector<std::string>>("sink_var_names");
std::vector<std::vector<int64_t>> res_dims;
std::vector<int32_t> res_lod_levels;
for (const std::string& var_name : sink_var_names) {
auto* sink_var = sub_block->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(sink_var);
res_dims.emplace_back(sink_var->GetShape());
res_lod_levels.push_back(sink_var->GetLoDLevel());
}
auto* out_reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetShapes(res_dims);
out_reader->SetLoDLevels(res_lod_levels);
}
};
class CustomReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]);
PADDLE_ENFORCE_NOT_NULL(out_reader);
out_reader->SetType(framework::proto::VarType::READER);
auto sink_var_names =
boost::get<std::vector<std::string>>(op_desc.GetAttr("sink_var_names"));
const auto* sub_block =
boost::get<framework::BlockDesc*>(op_desc.GetAttr("sub_block"));
std::vector<framework::proto::VarType::Type> res_data_types;
for (const std::string& var_name : sink_var_names) {
framework::VarDesc* var = sub_block->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var);
res_data_types.emplace_back(var->GetDataType());
}
out_reader->SetDataTypes(res_data_types);
}
};
void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
out->clear();
std::vector<framework::LoDTensor> underlying_outs;
reader_->ReadNext(&underlying_outs);
if (underlying_outs.empty()) {
// There is no more data.
return;
}
PADDLE_ENFORCE(source_var_names_.size() == underlying_outs.size(),
"The size of source_var_names(%d) and the size of "
"underlying_outs(%d) are not consistent. Each feeding element "
"must have its own source variable.",
source_var_names_.size(), underlying_outs.size());
// The scope for CustomReader's sub-block should be independent and shouldn't
// be any other computation scope's child. Otherwise, data preprocessing and
// computation cannot run concurrently.
framework::Scope scope;
// 1. Copy LoDTensors from underlying reader's output to source variables.
for (size_t i = 0; i < source_var_names_.size(); ++i) {
framework::Variable* var = scope.Var(source_var_names_[i]);
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
tensor->ShareDataWith(underlying_outs[i]);
tensor->set_lod(underlying_outs[i].lod());
}
// 2. Run the sub-block.
exe_.Run(program_, &scope, sub_block_id_, false, true);
// 3. Copy LoDTensors from sink variables to out.
out->resize(sink_var_names_.size());
for (size_t i = 0; i < sink_var_names_.size(); ++i) {
const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i]))
.Get<framework::LoDTensor>();
framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
}
}
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_OPERATOR(create_custom_reader, ops::CreateCustomReaderOp,
ops::CreateCustomReaderOpMaker, ops::CustomReaderInferShape,
ops::CustomReaderInferVarType,
paddle::framework::EmptyGradOpMaker)
......@@ -115,6 +115,7 @@ void DecoratedReaderInferShape::operator()(
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetLoDLevels(in_reader->GetLoDLevels());
}
void DecoratedReaderInferVarType::operator()(
const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "unsupported/Eigen/CXX11/Tensor"
#include "paddle/function/EigenThreadDevice.h"
namespace paddle {
......@@ -70,25 +70,26 @@ struct EigenBlasGemm {
dims[0].first = transA ? 0 : 1;
dims[0].second = transB ? 1 : 0;
Eigen::DefaultDevice device;
auto* device = EigenDeviceWarpper::device();
if (N == ldc) {
if (alpha == T(1) && beta == T(0)) {
c.device(device) = a.contract(b, dims);
c.device(*device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.device(device) += a.contract(b, dims);
c.device(*device) += a.contract(b, dims);
} else {
c.device(device) = alpha * a.contract(b, dims) + beta * c;
c.device(*device) = alpha * a.contract(b, dims) + beta * c;
}
} else {
if (alpha == T(1) && beta == T(0)) {
c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
c.slice(offsetC, extentC).device(*device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
c.slice(offsetC, extentC).device(*device) += a.contract(b, dims);
} else {
c.slice(offsetC, extentC).device(device) =
c.slice(offsetC, extentC).device(*device) =
alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
}
}
EigenDeviceWarpper::free_device(device);
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#if defined(__OSX__) || defined(__APPLE__)
#include <sys/sysctl.h>
#include <sys/types.h>
#endif
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
#if defined(__ANDROID__)
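// On Android, /sys/devices/system/cpu/possible holds a range such as "0-7";
// the CPU count is the upper bound plus one (8 in this example).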
int GetCpuCount() {
FILE* fp = fopen("/sys/devices/system/cpu/possible", "r");
if (!fp) {
return 1;
}
int rank0, rank1;
int num = fscanf(fp, "%d-%d", &rank0, &rank1);
fclose(fp);
if (num < 2) return 1;
return rank1 + 1;
}
#elif defined(__OSX__) || defined(__APPLE__)
int GetCpuCount() {
int count = 0;
size_t len = sizeof(int);
sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
return count > 0 ? count : 1;
}
#else
int GetCpuCount() { return 1; }
#endif
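// Wraps the Eigen device used by EigenBlasGemm: with EIGEN_USE_THREADS a
// process-wide ThreadPoolDevice with at most two threads is created once and
// reused (free_device is a no-op); otherwise a DefaultDevice is allocated per
// call and released in free_device.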
class EigenDeviceWarpper {
public: // NOLINT
#if EIGEN_USE_THREADS
static Eigen::ThreadPoolDevice* device() {
const int num_cpus = GetCpuCount();
const int num_threads = (num_cpus > 2) ? 2 : num_cpus;
static Eigen::ThreadPool tp(num_threads);
static Eigen::ThreadPoolDevice* device =
new Eigen::ThreadPoolDevice(&tp, num_threads);
return device;
}
static void free_device(Eigen::ThreadPoolDevice* device) {
// do nothing
}
#else
static Eigen::DefaultDevice* device() {
Eigen::DefaultDevice* device = new Eigen::DefaultDevice;
return device;
}
static void free_device(Eigen::DefaultDevice* device) { delete device; }
#endif
};
} // namespace paddle
......@@ -7,6 +7,10 @@ set(OPITMIZER_SRCS
sgd_optimizer.cc
)
cc_library(paddle_optimizer STATIC SRCS ${OPITMIZER_SRCS} DEPS paddle_proto glog)
cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
add_library(paddle_optimizer ${OPITMIZER_SRCS})
target_link_libraries(paddle_optimizer paddle_proto glog)
if (WITH_TESTING)
add_unittest(serialization_test serialization_test.cc)
add_unittest(parameter_optimizer_test parameter_optimizer_test.cc)
endif()
#!/bin/bash
function cmake_gen() {
mkdir -p /paddle/build
cd /paddle/build
# build script will not fail if *.deb does not exist
rm *.deb 2>/dev/null || true
# delete previous built whl packages
rm -rf /paddle/paddle/dist 2>/dev/null || true
# Support build for all python versions, currently
# including cp27-cp27m and cp27-cp27mu.
PYTHON_FLAGS=""
if [ "$1" != "" ]; then
echo "using python abi: $1"
if [ "$1" == "cp27-cp27m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
elif [ "$1" == "cp27-cp27mu" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
fi
fi
cat <<EOF
========================================
Configuring cmake in /paddle/build ...
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
${PYTHON_FLAGS}
-DWITH_DSO=ON
-DWITH_DOC=${WITH_DOC:-OFF}
-DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-OFF}
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-DWITH_SWIG_PY=ON
-DWITH_C_API=${WITH_C_API:-OFF}
-DWITH_PYTHON=${WITH_PYTHON:-ON}
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-DCUDNN_ROOT=/usr/
-DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
# docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
${PYTHON_FLAGS} \
-DWITH_DSO=ON \
-DWITH_DOC=${WITH_DOC:-OFF} \
-DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-DWITH_C_API=${WITH_C_API:-OFF} \
-DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \
-DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
}
function run_build() {
cat <<EOF
============================================
Building in /paddle/build ...
============================================
EOF
make clean
make -j `nproc`
}
function run_test() {
if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
cat <<EOF
========================================
Running unit tests ...
========================================
EOF
ctest --output-on-failure
# make install should also be tested when running unit tests
make install -j `nproc`
pip install /usr/local/opt/paddle/share/wheels/*.whl
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version
fi
fi
}
function gen_docs() {
if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
cat <<EOF
========================================
Building documentation ...
In /paddle/build_doc
========================================
EOF
mkdir -p /paddle/build_doc
pushd /paddle/build_doc
cmake .. \
-DWITH_DOC=ON \
-DWITH_GPU=OFF \
-DWITH_AVX=${WITH_AVX:-ON} \
-DWITH_SWIG_PY=ON
make -j `nproc` paddle_docs paddle_apis
popd
fi
if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
cat <<EOF
========================================
Converting C++ source code into HTML ...
========================================
EOF
export WOBOQ_OUT=/paddle/build/woboq_out
mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data
/woboq/generator/codebrowser_generator \
-b /paddle/build \
-a \
-o $WOBOQ_OUT \
-p paddle:/paddle
/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
fi
}
function gen_dockerfile() {
# Set BASE_IMAGE according to env variables
if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04"
else
BASE_IMAGE="ubuntu:16.04"
fi
DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/x86_64-linux-gnu/libcudnn.so"
fi
cat <<EOF
========================================
Generate /paddle/build/Dockerfile ...
========================================
EOF
cat > /paddle/build/Dockerfile <<EOF
FROM ${BASE_IMAGE}
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ENV HOME /root
EOF
if [[ ${WITH_GPU} == "ON" ]]; then
NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&"
else
NCCL_DEPS=""
fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
PADDLE_VERSION="paddle version"
CMD='"paddle", "version"'
else
PADDLE_VERSION="true"
CMD='"true"'
fi
cat >> /paddle/build/Dockerfile <<EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update &&\
${NCCL_DEPS}\
apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \
pip install /*.whl; apt-get install -f -y && \
apt-get clean -y && \
rm -f /*.whl && \
${PADDLE_VERSION} && \
ldconfig
${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_GPU_ENV}
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
cat >> /paddle/build/Dockerfile <<EOF
ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/
EOF
fi
cat >> /paddle/build/Dockerfile <<EOF
# default command shows the paddle version and exit
CMD [${CMD}]
EOF
}
function gen_capi_package() {
if [[ ${WITH_C_API} == "ON" ]]; then
install_prefix="/paddle/build/capi_output"
rm -rf $install_prefix
make DESTDIR="$install_prefix" install
cd $install_prefix/usr/local
ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz
fi
}
function gen_fluid_inference_lib() {
if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
cat <<EOF
========================================
Deploying fluid inference library ...
========================================
EOF
make -j `nproc` inference_lib_dist
fi
}
set -xe
cmake_gen ${PYTHON_ABI:-""}
run_build
run_test
gen_docs
gen_dockerfile
gen_capi_package
gen_fluid_inference_lib
if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n"
else
printf "If you need to install PaddlePaddle in develop docker image,"
printf "please make install or pip install build/python/dist/*.whl.\n"
fi
#!/bin/bash
set -xe
if [ $ANDROID_ABI == "arm64-v8a" ]; then
ANDROID_ARCH=arm64
if [ $ANDROID_API -lt 21 ]; then
echo "Warning: arm64-v8a requires ANDROID_API >= 21."
ANDROID_API=21
fi
else # armeabi, armeabi-v7a
ANDROID_ARCH=arm
fi
ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
cat <<EOF
============================================
Generating the standalone toolchain ...
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh
--arch=$ANDROID_ARCH
--platform=android-$ANDROID_API
--install-dir=${ANDROID_STANDALONE_TOOLCHAIN}
============================================
EOF
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
--arch=$ANDROID_ARCH \
--platform=android-$ANDROID_API \
--install-dir=$ANDROID_STANDALONE_TOOLCHAIN
BUILD_ROOT=/paddle/build_android
DEST_ROOT=/paddle/install_android
mkdir -p $BUILD_ROOT
cd $BUILD_ROOT
if [ $ANDROID_ABI == "armeabi-v7a" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_NEON=ON \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
elif [ $ANDROID_ABI == "arm64-v8a" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DUSE_EIGEN_FOR_BLAS=OFF \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
elif [ $ANDROID_ABI == "armeabi" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
else
echo "Invalid ANDROID_ABI: $ANDROID_ABI"
fi
cat <<EOF
============================================
Building in $BUILD_ROOT ...
============================================
EOF
make -j `nproc`
make install -j `nproc`
#!/bin/bash
/usr/sbin/sshd -D &
jupyter notebook --ip=0.0.0.0 /paddle/book/
#!/bin/bash
set -e
# the number of process to run tests
NUM_PROC=6
# calculate and set the memory usage for each process
MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
# get the CUDA device count
CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
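# Example: with NUM_PROC=6 on 2 GPUs, each process caps itself at ~0.17 of
# GPU memory and process i sees the rotated device list "i%2,(i+1)%2".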
for (( i = 0; i < $NUM_PROC; i++ )); do
cuda_list=()
for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do
s=$[i+j]
n=$[s%CUDA_DEVICE_COUNT]
if [ $j -eq 0 ]; then
cuda_list=("$n")
else
cuda_list="$cuda_list,$n"
fi
done
echo $cuda_list
# CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
# ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC --output-on-failure &
done
wait
......@@ -104,6 +104,8 @@ function cmake_gen() {
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=ON
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
......@@ -129,7 +131,8 @@ EOF
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=ON
}
function abort(){
......
#!/bin/bash
set -e
# Create the build directory for CMake.
mkdir -p $TRAVIS_BUILD_DIR/build
cd $TRAVIS_BUILD_DIR/build
# Compile Documentation only.
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make -j `nproc` paddle_docs paddle_apis
# check websites for broken links
linkchecker doc/v2/en/html/index.html
linkchecker doc/v2/cn/html/index.html
linkchecker doc/v2/api/en/html/index.html
#!/bin/bash
set -e
# Create the build directory for CMake.
mkdir -p $TRAVIS_BUILD_DIR/build_ios
cd $TRAVIS_BUILD_DIR/build_ios
# Compile paddle binaries
cmake -DCMAKE_SYSTEM_NAME=iOS \
-DIOS_PLATFORM=OS \
-DCMAKE_OSX_ARCHITECTURES="arm64" \
-DWITH_C_API=ON \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_TESTING=OFF \
-DWITH_SWIG_PY=OFF \
-DCMAKE_BUILD_TYPE=Release \
..
make -j 2
#!/bin/bash
function abort(){
echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
echo "Please use pre-commit to check what is wrong." 1>&2
exit 1
}
trap 'abort' 0
set -e
# install glide
curl https://glide.sh/get | bash
eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
# set up go environment for running gometalinter
mkdir -p $GOPATH/src/github.com/PaddlePaddle/
ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
cd $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
go get github.com/alecthomas/gometalinter
gometalinter --install
cd $TRAVIS_BUILD_DIR
export PATH=/usr/bin:$PATH
pre-commit install
clang-format --version
if ! pre-commit run -a ; then
git diff
exit 1
fi
trap : 0
......@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from .. import core
from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
......@@ -21,7 +22,8 @@ from ..executor import global_scope
__all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer'
'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
'random_data_generator', 'Preprocessor'
]
......@@ -535,8 +537,6 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
inputs={'UnderlyingReader': reader},
outputs={'Out': [new_reader]},
attrs=attrs)
new_reader.persistable = True
new_reader.stop_gradient = True
return monkey_patch_reader_methods(new_reader)
......@@ -581,3 +581,82 @@ def read_file(file_obj):
return out[0]
else:
return out
class Preprocessor(object):
BEFORE_SUB_BLOCK = 0
IN_SUB_BLOCK = 1
AFTER_SUB_BLOCK = 2
def __init__(self, reader, name=None):
self.underlying_reader = reader
new_reader_name = name if name is not None else unique_name(
"create_custom_reader")
self.main_prog = default_main_program()
self.reader = self.main_prog.current_block().create_var(
name=new_reader_name)
self.sub_block = None
self.source_var_names = None
self.sink_var_names = None
self.status = Preprocessor.BEFORE_SUB_BLOCK
def is_completed(self):
return self.sub_block and self.source_var_names and self.sink_var_names
@contextlib.contextmanager
def block(self):
self.status = Preprocessor.IN_SUB_BLOCK
self.sub_block = self.main_prog.create_block()
yield
self.main_prog.rollback()
self.status = Preprocessor.AFTER_SUB_BLOCK
if not self.is_completed():
raise RuntimeError(
"The definition of preprocessor is incompleted! "
"Please make sure that you have set input and output "
"variables by invoking 'inputs' and 'outputs' in "
"Preprocessor's sub-block.")
def inputs(self):
if self.status != Preprocessor.IN_SUB_BLOCK:
raise RuntimeError(
"Preprocessor.inputs() can only be invoked inside the sub-block."
)
source_shapes = self.underlying_reader.desc.shapes()
source_dtypes = self.underlying_reader.desc.dtypes()
source_lod_levels = self.underlying_reader.desc.lod_levels()
self.source_var_names = [
unique_name("preprocessor_source")
for _ in xrange(len(source_shapes))
]
source_vars = []
for var_name, shape, dtype, lod_level in zip(
self.source_var_names, source_shapes, source_dtypes,
source_lod_levels):
source_vars.append(self.main_prog.current_block().create_var(
name=var_name, shape=shape, dtype=dtype, lod_level=lod_level))
return source_vars
def outputs(self, *outs):
if self.status != Preprocessor.IN_SUB_BLOCK:
raise RuntimeError(
"Preprocessor.outputs() can only be invoked inside the sub-block."
)
self.sink_var_names = [var.name for var in outs]
def __call__(self, *args, **kwargs):
if self.status != Preprocessor.AFTER_SUB_BLOCK:
raise RuntimeError(
"Preprocessor output can only be retrieved after rnn block.")
self.main_prog.current_block().append_op(
type="create_custom_reader",
inputs={'UnderlyingReader': self.underlying_reader},
outputs={'Out': [self.reader]},
attrs={
"sub_block": self.sub_block,
"source_var_names": self.source_var_names,
"sink_var_names": self.sink_var_names
})
return monkey_patch_reader_methods(self.reader)
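# A minimal usage sketch for Preprocessor (assuming an underlying reader
# whose elements are an image tensor and a label; names are illustrative):
#
#     preprocessor = fluid.layers.Preprocessor(reader=raw_reader)
#     with preprocessor.block():
#         img, label = preprocessor.inputs()
#         scaled = fluid.layers.scale(img, scale=1. / 255.)
#         preprocessor.outputs(scaled, label)
#     processed_reader = preprocessor()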
......@@ -24,19 +24,65 @@ from tensor import concat
import utils
__all__ = [
'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru',
'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy',
'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d',
'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'batch_norm',
'beam_search_decode', 'conv2d_transpose', 'sequence_expand', 'lstm_unit',
'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod',
'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk',
'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce',
'beam_search', 'row_conv', 'multiplex', 'layer_norm',
'softmax_with_cross_entropy', 'smooth_l1', 'one_hot',
'autoincreased_step_counter', 'reshape', 'lod_reset', 'lrn', 'pad',
'label_smooth', 'roi_pool', 'dice_loss', 'bilinear_interp', 'random_crop'
'fc',
'embedding',
'dynamic_lstm',
'dynamic_lstmp',
'dynamic_gru',
'gru_unit',
'linear_chain_crf',
'crf_decoding',
'cos_sim',
'cross_entropy',
'square_error_cost',
'chunk_eval',
'sequence_conv',
'conv2d',
'sequence_pool',
'sequence_softmax',
'softmax',
'pool2d',
'batch_norm',
'beam_search_decode',
'conv2d_transpose',
'sequence_expand',
'lstm_unit',
'reduce_sum',
'reduce_mean',
'reduce_max',
'reduce_min',
'reduce_prod',
'sequence_first_step',
'sequence_last_step',
'dropout',
'split',
'ctc_greedy_decoder',
'edit_distance',
'l2_normalize',
'matmul',
'topk',
'warpctc',
'sequence_reshape',
'transpose',
'im2sequence',
'nce',
'beam_search',
'row_conv',
'multiplex',
'layer_norm',
'softmax_with_cross_entropy',
'smooth_l1',
'one_hot',
'autoincreased_step_counter',
'reshape',
'lod_reset',
'lrn',
'pad',
'label_smooth',
'roi_pool',
'dice_loss',
'upsampling_bilinear2d',
'random_crop',
]
......@@ -3881,8 +3927,10 @@ def dice_loss(input, label, epsilon=0.00001):
return reduce_mean(dice_score)
def bilinear_interp(input, out_h, out_w, name=None):
def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
"""
The operation performed by upsampling_bilinear2d is known mathematically as
bilinear interpolation.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this layer) on a rectilinear 2D grid.
......@@ -3894,8 +3942,13 @@ def bilinear_interp(input, out_h, out_w, name=None):
input (Variable): The input tensor of bilinear interpolation,
This is a 4-D tensor of the shape
(num_batches, channels, in_h, in_w).
out_h (int): output height of bilinear interpolation layer.
out_w (int): output width of bilinear interpolation layer.
out_shape(list|tuple|None): Output shape of bilinear interpolation
layer, the shape is (out_h, out_w).
Default: None
scale(int|None): The multiplier for the input height or width.
At least one of out_shape or scale must be set,
and out_shape takes priority over scale.
Default: None
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
......@@ -3906,10 +3959,27 @@ def bilinear_interp(input, out_h, out_w, name=None):
Examples:
.. code-block:: python
out = fluid.layers.bilinear_interp(input, out_h=12, out_w=12)
out = fluid.layers.upsampling_bilinear2d(input, out_shape=[12, 12])
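# alternatively, enlarge both spatial dimensions by an integer factor
out = fluid.layers.upsampling_bilinear2d(input, scale=2)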
"""
if out_shape is None and scale is None:
raise ValueError("One of out_shape and scale must not be None")
helper = LayerHelper('bilinear_interp', **locals())
dtype = helper.input_dtype()
def _is_list_or_tuple_(data):
return isinstance(data, (list, tuple))
if out_shape is not None:
if not (_is_list_or_tuple_(out_shape) and len(out_shape) == 2):
raise ValueError('out_shape should be a list or tuple '
'with length 2, (out_h, out_w).')
out_shape = list(map(int, out_shape))
out_h = out_shape[0]
out_w = out_shape[1]
else:
out_h = int(input.shape[2] * scale)
out_w = int(input.shape[3] * scale)
out = helper.create_tmp_variable(dtype)
helper.append_op(
type="bilinear_interp",
......
......@@ -93,12 +93,12 @@ def _convert_lod(lod):
def create_lod_tensor(data, lod, place):
"""Create a lod tensor from a numpy array or an existing lod tensor.
"""Create a lod tensor from a numpy array, a list, or an existing lod tensor.
Create a lod tensor by doing the following:
1. Check that the length-based input lod is valid.
2. Convert the length-based lod to a offset-based LoD.
3. Copy the data from a numpy array or a existing lod tensor to
3. Copy the data from a numpy array, a list or a existing lod tensor to
CPU or GPU device (based on input place).
4. Set the level of detail (LoD) using the offset-based LoD.
......@@ -117,7 +117,7 @@ def create_lod_tensor(data, lod, place):
for more details regarding LoD.
Args:
data: a numpy array or a LoDTensor holding the data to be copied.
data: a numpy array or a LoDTensor or a list holding the data to be copied.
lod: a list of lists indicating the length-based LoD info specified by the user.
place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
......@@ -126,6 +126,18 @@ def create_lod_tensor(data, lod, place):
"""
if isinstance(data, core.LoDTensor):
return create_lod_tensor(np.array(data), lod, place)
elif isinstance(data, list):
# When input data is a list, it only deals with the case where the base element
# is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated
# LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number
# of words or other indexes in the sequence.
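# For example, data = [[1, 2, 3], [4, 5]] with lod = [[3, 2]] produces a
# LoDTensor of shape [5, 1] holding the five flattened int64 ids.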
new_lod = []
for seq in data:
new_lod.append(len(seq))
assert [new_lod] == lod, "data and lod do not match"
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
return create_lod_tensor(flattened_data, lod, place)
elif isinstance(data, np.ndarray):
assert _validate_lod(lod,
data.shape[0]), "the provided lod info is invalid"
......@@ -134,9 +146,8 @@ def create_lod_tensor(data, lod, place):
tensor.set_lod(_convert_lod(lod))
return tensor
else:
raise Exception(
"data should be either a LoDTensor or a Numpy array, but you pass type %s instead"
% (type(data)))
raise TypeError(
"data should be either a LoDTensor, a Numpy array or a list")
def create_random_int_lodtensor(lod, base_shape, place, low, high):
......
......@@ -48,7 +48,7 @@ def linear():
return avg_loss
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
......@@ -68,8 +68,8 @@ def train(use_cuda, train_program, save_dirname):
['15.343549569447836']
...
'''
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
trainer.stop()
trainer.train(
......@@ -80,13 +80,13 @@ def train(use_cuda, train_program, save_dirname):
# infer
def infer(use_cuda, inference_program, save_dirname=None):
if save_dirname is None:
def infer(use_cuda, inference_program, params_dirname=None):
if params_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 10
tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
......@@ -100,10 +100,10 @@ def main(use_cuda):
return
# Directory for saving the trained model
save_dirname = "fit_a_line.inference.model"
params_dirname = "fit_a_line.inference.model"
train(use_cuda, linear, save_dirname)
infer(use_cuda, inference_program, save_dirname)
train(use_cuda, linear, params_dirname)
infer(use_cuda, inference_program, params_dirname)
class TestFitALine(unittest.TestCase):
......
......@@ -85,7 +85,7 @@ def train_network():
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
BATCH_SIZE = 128
EPOCH_NUM = 1
......@@ -105,8 +105,8 @@ def train(use_cuda, train_program, save_dirname):
print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......@@ -122,10 +122,10 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
......@@ -142,12 +142,14 @@ def main(use_cuda):
save_path = "image_classification_resnet.inference.model"
train(
use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
save_dirname=save_path)
params_dirname=save_path)
if __name__ == '__main__':
......
......@@ -64,7 +64,7 @@ def train_network():
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
BATCH_SIZE = 128
train_reader = paddle.batch(
paddle.reader.shuffle(
......@@ -82,8 +82,8 @@ def train(use_cuda, train_program, save_dirname):
print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......@@ -99,10 +99,10 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
......@@ -119,12 +119,14 @@ def main(use_cuda):
save_path = "image_classification_vgg.inference.model"
train(
use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
save_dirname=save_path)
params_dirname=save_path)
if __name__ == '__main__':
......
......@@ -141,7 +141,7 @@ def train_program():
return [avg_cost]
def train(use_cuda, train_program, save_path):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
......@@ -172,7 +172,7 @@ def train(use_cuda, train_program, save_path):
print("avg_cost: %s" % avg_cost)
if float(avg_cost) < 100.0: # Large value to increase CI speed
trainer.save_params(save_path)
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
float(avg_cost)))
......@@ -183,7 +183,7 @@ def train(use_cuda, train_program, save_path):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_path)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -197,10 +197,10 @@ def train(use_cuda, train_program, save_path):
feed_order=feed_order)
def infer(use_cuda, inference_program, save_path):
def infer(use_cuda, inference_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
inference_program, param_path=save_path, place=place)
inference_program, param_path=params_dirname, place=place)
# Setup inputs by creating LoDTensors to represent sequences of words.
# Here each word is the basic element of these LoDTensors and the shape of
......@@ -251,9 +251,9 @@ def infer(use_cuda, inference_program, save_path):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "label_semantic_roles.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "label_semantic_roles.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -57,7 +57,7 @@ def train_program():
return [avg_cost, acc]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
......@@ -78,7 +78,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
event.epoch + 1, avg_cost, acc))
......@@ -100,11 +100,11 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -116,17 +116,17 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
save_dirname = "recognize_digits_conv.inference.model"
params_dirname = "recognize_digits_conv.inference.model"
# call train() with is_local argument to run distributed train
train(
use_cuda=use_cuda,
train_program=train_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
if __name__ == '__main__':
......
......@@ -44,7 +44,7 @@ def train_program():
return [avg_cost, acc]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
......@@ -62,7 +62,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
event.epoch + 1, avg_cost, acc))
......@@ -81,11 +81,11 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -97,17 +97,17 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
save_dirname = "recognize_digits_mlp.inference.model"
params_dirname = "recognize_digits_mlp.inference.model"
# call train() with is_local argument to run distributed train
train(
use_cuda=use_cuda,
train_program=train_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
if __name__ == '__main__':
......
......@@ -155,7 +155,7 @@ def train_program():
return [avg_cost, scale_infer]
def train(use_cuda, train_program, save_path):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.SGD(learning_rate=0.2)
......@@ -180,7 +180,7 @@ def train(use_cuda, train_program, save_path):
print("avg_cost: %s" % avg_cost)
if float(avg_cost) < 4: # Smaller value to increase CI speed
trainer.save_params(save_path)
trainer.save_params(params_dirname)
trainer.stop()
else:
print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
......@@ -197,43 +197,30 @@ def train(use_cuda, train_program, save_path):
num_epochs=1,
event_handler=event_handler,
reader=train_reader,
feed_order=[
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id',
'category_id', 'movie_title', 'score'
])
feed_order=feed_order)
def infer(use_cuda, inference_program, save_path):
def infer(use_cuda, inference_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
inference_program, param_path=save_path, place=place)
def create_lod_tensor(data, lod=None):
tensor = fluid.LoDTensor()
if lod is None:
# Tensor, the shape is [batch_size, 1]
index = 0
lod_0 = [index]
for l in range(len(data)):
index += 1
lod_0.append(index)
lod = [lod_0]
tensor.set_lod(lod)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
tensor.set(flattened_data, place)
return tensor
# Generate a random input for inference
user_id = create_lod_tensor([[1]])
gender_id = create_lod_tensor([[1]])
age_id = create_lod_tensor([[0]])
job_id = create_lod_tensor([[10]])
movie_id = create_lod_tensor([[783]])
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
[[0, 5]])
inference_program, param_path=params_dirname, place=place)
# Use the first data from paddle.dataset.movielens.test() as input.
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor,
# where `data` is a list of sequences of index numbers, `lod` is
# the level of detail (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
# indicating that `data` consists of two sequences of length 3 and 2.
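# As a quick illustrative check of that convention (mirrored by the
# LoDTensor unit test later in this diff):
#   fluid.create_lod_tensor([[1, 2, 3], [4, 5]], [[3, 2]], place)
# yields a tensor whose tensor.lod() is the offset form [[0, 3, 5]].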
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
age_id = fluid.create_lod_tensor([[0]], [[1]], place)
job_id = fluid.create_lod_tensor([[10]], [[1]], place)
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
place)
results = inferencer.infer(
{
......@@ -253,12 +240,15 @@ def infer(use_cuda, inference_program, save_path):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "recommender_system.inference.model"
train(use_cuda=use_cuda, train_program=train_program, save_path=save_path)
params_dirname = "recommender_system.inference.model"
train(
use_cuda=use_cuda,
train_program=train_program,
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
save_path=save_path)
params_dirname=params_dirname)
if __name__ == '__main__':
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# This test is buggy
# py_test(test_understand_sentiment_dynamic_rnn SRCS
# test_understand_sentiment_dynamic_rnn.py SERIAL)
LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn)
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
......
......@@ -64,7 +64,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -85,7 +85,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -97,7 +97,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -112,13 +112,13 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
......@@ -143,9 +143,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -79,7 +79,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -100,7 +100,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -112,7 +112,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -127,13 +127,13 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
......@@ -158,9 +158,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -71,7 +71,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -92,7 +92,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -104,7 +104,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -119,13 +119,13 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
......@@ -150,9 +150,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_stacked_lstm.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_stacked_lstm.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -80,7 +80,7 @@ def train_program(is_sparse):
return avg_cost
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
test_reader = paddle.batch(
......@@ -97,7 +97,7 @@ def train(use_cuda, train_program, save_dirname):
print("loss= ", avg_cost)
if avg_cost < 10.0:
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
if math.isnan(avg_cost):
......@@ -115,10 +115,10 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
# is simply an index to look up for the corresponding word vector and hence
......@@ -153,17 +153,17 @@ def main(use_cuda, is_sparse):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "word2vec.inference.model"
params_dirname = "word2vec.inference.model"
train(
use_cuda=use_cuda,
train_program=partial(train_program, is_sparse),
save_dirname=save_path)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=partial(inference_program, is_sparse),
save_dirname=save_path)
params_dirname=params_dirname)
if __name__ == '__main__':
......
......@@ -173,63 +173,33 @@ def train(use_cuda, save_dirname, is_local=True):
test_reader = paddle.batch(
paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
feeding = {
'user_id': 0,
'gender_id': 1,
'age_id': 2,
'job_id': 3,
'movie_id': 4,
'category_id': 5,
'movie_title': 6,
'score': 7
}
def func_feed(feeding, data):
feed_tensors = {}
for (key, idx) in feeding.iteritems():
tensor = fluid.LoDTensor()
if key != "category_id" and key != "movie_title":
if key == "score":
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"float32")
else:
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"int64")
else:
numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
data)
lod_info = [len(item) for item in numpy_data]
offset = 0
lod = [offset]
for item in lod_info:
offset += item
lod.append(offset)
numpy_data = np.concatenate(numpy_data, axis=0)
tensor.set_lod([lod])
numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
tensor.set(numpy_data, place)
feed_tensors[key] = tensor
return feed_tensors
feed_order = [
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
'movie_title', 'score'
]
def train_loop(main_program):
exe.run(framework.default_startup_program())
feed_list = [
main_program.global_block().var(var_name) for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for batch_id, data in enumerate(train_reader()):
# train a mini-batch
outs = exe.run(program=main_program,
feed=func_feed(feeding, data),
feed=feeder.feed(data),
fetch_list=[avg_cost])
out = np.array(outs[0])
if (batch_id + 1) % 10 == 0:
avg_cost_set = []
for test_data in test_reader():
avg_cost_np = exe.run(
program=test_program,
feed=func_feed(feeding, test_data),
fetch_list=[avg_cost])
avg_cost_np = exe.run(program=test_program,
feed=feeder.feed(test_data),
fetch_list=[avg_cost])
avg_cost_set.append(avg_cost_np[0])
break # test only 1 segment for speeding up CI
......@@ -279,23 +249,6 @@ def infer(use_cuda, save_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
def create_lod_tensor(data, lod=None):
tensor = fluid.LoDTensor()
if lod is None:
# Tensor, the shape is [batch_size, 1]
index = 0
lod_0 = [index]
for l in range(len(data)):
index += 1
lod_0.append(index)
lod = [lod_0]
tensor.set_lod(lod)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
tensor.set(flattened_data, place)
return tensor
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
# Use fluid.io.load_inference_model to obtain the inference program desc,
......@@ -307,26 +260,33 @@ def infer(use_cuda, save_dirname=None):
# Use the first data from paddle.dataset.movielens.test() as input
assert feed_target_names[0] == "user_id"
user_id = create_lod_tensor([[1]])
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor
# where `data` is a list of sequences of index numbers, `lod` is
# the level of detail (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
# indicating that `data` consists of two sequences of length 3 and 2.
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
assert feed_target_names[1] == "gender_id"
gender_id = create_lod_tensor([[1]])
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
assert feed_target_names[2] == "age_id"
age_id = create_lod_tensor([[0]])
age_id = fluid.create_lod_tensor([[0]], [[1]], place)
assert feed_target_names[3] == "job_id"
job_id = create_lod_tensor([[10]])
job_id = fluid.create_lod_tensor([[10]], [[1]], place)
assert feed_target_names[4] == "movie_id"
movie_id = create_lod_tensor([[783]])
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
assert feed_target_names[5] == "category_id"
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
assert feed_target_names[6] == "movie_title"
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
[[0, 5]])
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]],
[[5]], place)
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
......
......@@ -53,11 +53,14 @@ class TestLoDTensor(unittest.TestCase):
self.assertEqual(_convert_lod(lod), converted_lod)
def test_create_lod_tensor(self):
# Only numpy array or a fluid LoDTensor is valid input to
# create_lod_tensor function, currently a list of lists is not.
data = [[1, 2], [3, 4]]
self.assertRaises(Exception, create_lod_tensor, data, [],
# Create LoDTensor from a list
data = [[1, 2, 3], [3, 4]]
wrong_lod = [[2, 2]]
correct_lod = [[3, 2]]
self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod,
fluid.CPUPlace())
tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace())
self.assertEqual(tensor.lod(), [[0, 3, 5]])
# Create LoDTensor from numpy array
data = numpy.random.random([10, 1])
......
......@@ -481,9 +481,9 @@ class OpTest(unittest.TestCase):
def np_dtype_to_fluid_dtype(input):
"""Change the dtype of float16 numpy array
numpy float16 is binded to paddle::platform::float16
numpy float16 is binded to paddle::platform::float16
in tensor_py.h via the help of uint16 data type since
the internal memory representation of float16 is
the internal memory representation of float16 is
uint16_t in paddle and np.uint16 in numpy, which are
themselves binded together by pybind.
......@@ -491,9 +491,9 @@ class OpTest(unittest.TestCase):
input: input numpy array
Returns:
input: The dtype of input will be changed to np.uint16 if
input: The dtype of input will be changed to np.uint16 if
it is originally np.float16, such that the internal memory
of input will be reinterpreted as of dtype np.uint16.
of input will be reinterpreted as of dtype np.uint16.
"""
if input.dtype == np.float16:
input.dtype = np.uint16
......
......@@ -50,5 +50,27 @@ class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
self.check_output()
class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest):
def setUp(self):
self.op_type = "fill_constant_batch_size_like"
self.inputs = {
'Input': (np.random.random((31, 28)).astype("float32"),
[[0, 9, 23, 31]])
}
self.attrs = {
'value': 3.5,
'shape': [-1, 16],
'input_dim_idx': 0,
'output_dim_idx': 0
}
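# With a LoD input, the -1 in 'shape' resolves to the number of
# sequences rather than the first tensor dimension: the lod
# [[0, 9, 23, 31]] above describes 3 sequences, hence the (3, 16)
# expected output below.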
out = np.random.random((3, 16)).astype("float32")
out.fill(3.5)
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
......@@ -369,11 +369,13 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output)
print(str(program))
def test_bilinear_interp(self):
def test_upsampling_bilinear2d(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
output = layers.bilinear_interp(x, 12, 12)
output = layers.upsampling_bilinear2d(x, out_shape=[12, 12])
self.assertIsNotNone(output)
output = layers.upsampling_bilinear2d(x, scale=3)
self.assertIsNotNone(output)
print(str(program))
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
def PolygonBoxRestore(input):
shape = input.shape
batch_size = shape[0]
geo_channels = shape[1]
h = shape[2]
w = shape[3]
h_indexes = np.array(range(h) * w).reshape(
[w, h]).transpose()[np.newaxis, :] # [1, h, w]
w_indexes = np.array(range(w) * h).reshape(
[h, w])[np.newaxis, :] # [1, h, w]
indexes = np.concatenate(
(w_indexes, h_indexes))[np.newaxis, :] # [1, 2, h, w]
indexes = indexes.repeat(
[geo_channels / 2],
axis=0)[np.newaxis, :] # [1, geo_channels/2, 2, h, w]
indexes = indexes.repeat(
[batch_size], axis=0) # [batch_size, geo_channels/2, 2, h, w]
return indexes.reshape(
input.shape) - input # [batch_size, geo_channels, h, w]
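# Illustrative shapes for h = w = 2 (Python 2 list arithmetic):
# range(2) * 2 == [0, 1, 0, 1], so h_indexes[0] == [[0, 0], [1, 1]]
# (row index of each pixel) and w_indexes[0] == [[0, 1], [0, 1]]
# (column index of each pixel); this reference implementation returns
# the stacked coordinate grid minus the input.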
class TestPolygonBoxRestoreOp(OpTest):
def config(self):
self.input_shape = (1, 8, 2, 2)
def setUp(self):
self.config()
self.op_type = "polygon_box_transform"
input = np.random.random(self.input_shape).astype("float32")
self.inputs = {'Input': input}
output = PolygonBoxRestore(input)
self.outputs = {'Output': output}
def test_check_output(self):
self.check_output()
class TestCase1(TestPolygonBoxRestoreOp):
def config(self):
self.input_shape = (2, 10, 3, 2)
class TestCase2(TestPolygonBoxRestoreOp):
def config(self):
self.input_shape = (3, 12, 4, 5)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle.v2.dataset.mnist as mnist
class TestPreprocessor(unittest.TestCase):
def setUp(self):
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=32)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
'./mnist_for_preprocessor_test.recordio', reader, feeder)
def test_main(self):
N = 10
img_expected_res = []
lbl_expected_res = []
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.io.open_recordio_file(
'./mnist_for_preprocessor_test.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
img, lbl = fluid.layers.io.read_file(data_file)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for _ in range(N):
img_v, lbl_v = exe.run(fetch_list=[img, lbl])
img_expected_res.append(img_v / 2)
lbl_expected_res.append(lbl_v + 1)
img_actual_res = []
lbl_actual_res = []
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.io.open_recordio_file(
'./mnist_for_preprocessor_test.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
preprocessor = fluid.layers.io.Preprocessor(reader=data_file)
with preprocessor.block():
img, lbl = preprocessor.inputs()
img_out = img / 2
lbl_out = lbl + 1
preprocessor.outputs(img_out, lbl_out)
data_file = fluid.layers.io.double_buffer(preprocessor())
img, lbl = fluid.layers.io.read_file(data_file)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for _ in range(N):
img_v, lbl_v = exe.run(fetch_list=[img, lbl])
img_actual_res.append(img_v)
lbl_actual_res.append(lbl_v)
for idx in range(N):
# np.allclose only returns a bool; assert on it so a mismatch actually fails.
self.assertTrue(np.allclose(img_expected_res[idx], img_actual_res[idx]))
self.assertTrue(np.allclose(lbl_expected_res[idx], lbl_actual_res[idx]))
......@@ -107,7 +107,7 @@ class ControlFlowGraph(object):
# Repeatedly apply liveness updates until the algorithm stabilizes
# on a complete set of live-input and live-output vars.
while True:
for i in range(self.op_size, 0, -1):
for i in reversed(range(self.op_size)):
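# reversed(range(n)) visits ops n-1 .. 0; the previous
# range(n, 0, -1) visited n .. 1, starting one past the last valid
# index and never updating op 0.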
live_in[i] = set(self._live_in[i])
live_out[i] = set(self._live_out[i])
for s in self._successors[i]:
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DocstringChecker is used to check python doc string's style."""
import six
import astroid
from pylint.checkers import BaseChecker, utils
from pylint.interfaces import IAstroidChecker
from collections import defaultdict
import re
def register(linter):
"""Register checkers."""
linter.register_checker(DocstringChecker(linter))
class Docstring(object):
"""Docstring class holds the parsed doc string elements.
"""
def __init__(self):
self.d = defaultdict(list) #name->[]
self.clear()
def clear(self):
self.d['Args'] = []
self.d['Examples'] = []
self.d['Returns'] = []
self.d['Raises'] = []
self.args = {} #arg_name->arg_type
def get_level(self, string, indent=' '):
level = 0
unit_size = len(indent)
while string[:unit_size] == indent:
string = string[unit_size:]
level += 1
return level
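# e.g. get_level("        Args:") == 2 with the default
# four-space indent unit.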
def parse(self, doc):
"""parse gets sections from doc
Such as Args, Returns, Raises, Examples s
Args:
doc (string): is the astroid node doc string.
Returns:
True if doc is parsed successfully.
"""
self.clear()
lines = doc.splitlines()
state = ("others", -1)
for l in lines:
c = l.strip()
if len(c) <= 0:
continue
level = self.get_level(l)
if c.startswith("Args:"):
state = ("Args", level)
elif c.startswith("Returns:"):
state = ("Returns", level)
elif c.startswith("Raises:"):
state = ("Raises", level)
elif c.startswith("Examples:"):
state = ("Examples", level)
else:
if level > state[1]:
self.d[state[0]].append(c)
continue
state = ("others", -1)
self.d[state[0]].append(c)
self._arg_with_type()
return True
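# Illustrative example: parsing a docstring like
#     Args:
#         x (int): the input.
#     Returns:
#         The result.
# fills d['Args'] with ['x (int): the input.'] and d['Returns'] with
# ['The result.'].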
def get_returns(self):
return self.d['Returns']
def get_raises(self):
return self.d['Raises']
def get_examples(self):
return self.d['Examples']
def _arg_with_type(self):
for t in self.d['Args']:
m = re.search('([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t)
if m:
self.args[m.group(1)] = m.group(2)
return self.args
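# e.g. the Args line "scale (int): scale is the number." yields
# self.args['scale'] == '(int)'.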
class DocstringChecker(BaseChecker):
"""DosstringChecker is pylint checker to
check docstring style.
"""
__implements__ = (IAstroidChecker, )
POSITIONAL_MESSAGE_ID = 'str-used-on-positional-format-argument'
KEYWORD_MESSAGE_ID = 'str-used-on-keyword-format-argument'
name = 'doc-string-checker'
symbol = "doc-string"
priority = -1
msgs = {
'W9001': ('One line doc string on > 1 lines', symbol + "-one-line",
'Used when a short doc string is on multiple lines'),
'W9002':
('Doc string does not end with "." period', symbol + "-end-with",
'Used when a doc string does not end with a period'),
'W9003': ('All args with their types must be mentioned in doc string',
symbol + "-with-all-args",
'Used when not all arguments are in the doc string '),
'W9005': ('Missing docstring or docstring is too short',
symbol + "-missing", 'Add docstring longer >=10'),
'W9006': ('Docstring indent error, use 4 space for indent',
symbol + "-indent-error", 'Use 4 space for indent'),
'W9007': ('You should add `Returns` in comments',
symbol + "-with-returns",
'There should be a `Returns` section in comments'),
'W9008': ('You should add `Raises` section in comments',
symbol + "-with-raises",
'There should be a `Raises` section in comments'),
}
options = ()
def visit_functiondef(self, node):
"""visit_functiondef checks Function node docstring style.
Args:
node (astroid.node): The visiting node.
Returns:
True if successful, otherwise False.
"""
self.check_doc_string(node)
if node.tolineno - node.fromlineno <= 10:
return True
if not node.doc:
return True
doc = Docstring()
doc.parse(node.doc)
self.all_args_in_doc(node, doc)
self.with_returns(node, doc)
self.with_raises(node, doc)
def visit_module(self, node):
self.check_doc_string(node)
def visit_classdef(self, node):
self.check_doc_string(node)
def check_doc_string(self, node):
self.missing_doc_string(node)
self.one_line(node)
self.has_period(node)
self.indent_style(node)
def missing_doc_string(self, node):
if node.tolineno - node.fromlineno <= 10:
return True
if node.doc is None or len(node.doc) < 10:
self.add_message('W9005', node=node, line=node.fromlineno)
return False
# FIXME(gongwb): give the docstring line-no
def indent_style(self, node, indent=4):
"""indent_style checks docstring's indent style
Args:
node (astroid.node): The visiting node.
indent (int): The default indent of style
Returns:
True if successful other wise False.
"""
if node.doc is None:
return True
doc = node.doc
lines = doc.splitlines()
for l in lines:
cur_indent = len(l) - len(l.lstrip())
if cur_indent % indent != 0:
self.add_message('W9006', node=node, line=node.fromlineno)
return False
return True
def one_line(self, node):
"""one_line checks if docstring (len < 40) is on one line.
Args:
node (astroid.node): The node being visited.
Returns:
True if successful, otherwise False.
"""
doc = node.doc
if doc is None:
return True
if len(doc) > 40:
return True
elif sum(doc.find(nl) for nl in ('\n', '\r', '\n\r')) == -3:
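# str.find returns -1 when the substring is absent, so a sum of -3
# means none of the three newline variants occurs in the docstring.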
return True
else:
self.add_message('W9001', node=node, line=node.fromlineno)
return False
return True
def has_period(self, node):
"""has_period checks if one line doc end-with '.' .
Args:
node (astroid.node): the node is visiting.
Returns:
True if successful otherwise False.
"""
if node.doc is None:
return True
if len(node.doc.splitlines()) > 1:
return True
if not node.doc.strip().endswith('.'):
self.add_message('W9002', node=node, line=node.fromlineno)
return False
return True
def with_raises(self, node, doc):
"""with_raises checks if one line doc end-with '.' .
Args:
node (astroid.node): the node is visiting.
doc (Docstring): Docstring object.
Returns:
True if successful otherwise False.
"""
find = False
for t in node.body:
if not isinstance(t, astroid.Raise):
continue
find = True
break
if not find:
return True
if len(doc.get_raises()) == 0:
self.add_message('W9008', node=node, line=node.fromlineno)
return False
return True
def with_returns(self, node, doc):
"""with_returns checks if docstring comments what are returned .
Args:
node (astroid.node): the node is visiting.
doc (Docstring): Docstring object.
Returns:
True if successful otherwise False.
"""
find = False
for t in node.body:
if not isinstance(t, astroid.Return):
continue
find = True
break
if not find:
return True
if len(doc.get_returns()) == 0:
self.add_message('W9007', node=node, line=node.fromlineno)
return False
return True
def all_args_in_doc(self, node, doc):
"""all_args_in_doc checks if arguments are mentioned in doc
Args:
node (astroid.node): the node is visiting.
doc (Docstring): Docstring object
Returns:
True if successful otherwise False.
"""
args = []
for arg in node.args.get_children():
if (not isinstance(arg, astroid.AssignName)) \
or arg.name == "self":
continue
args.append(arg.name)
if len(args) <= 0:
return True
parsed_args = doc.args
if len(args) > 0 and len(parsed_args) <= 0:
print "debug:parsed args: ", parsed_args
self.add_message('W9003', node=node, line=node.fromlineno)
return False
for t in args:
if t not in parsed_args:
print t, " with (type) not in ", parsed_args
self.add_message('W9003', node=node, line=node.fromlineno)
return False
return True
#!/bin/bash
TOTAL_ERRORS=0
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
export PYTHONPATH=$DIR:$PYTHONPATH
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
pylint --disable=all --load-plugins=docstring_checker \
--enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done
#exit $TOTAL_ERRORS
#For now, just warning:
exit 0
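# Manual invocation sketch (assumes docstring_checker.py sits next to
# this hook, as the PYTHONPATH export above implies):
#   PYTHONPATH=$DIR pylint --disable=all --load-plugins=docstring_checker \
#     --enable=doc-string-missing some_file.py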
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import docstring_checker
import pylint.testutils
import astroid
import pytest
import sys
class TestDocstring(pylint.testutils.CheckerTestCase):
CHECKER_CLASS = docstring_checker.DocstringChecker
def test_one_line(self):
func_node = astroid.extract_node('''
def test():
"""get
news.
"""
if True:
return 5
return 5
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9001' == got[0][0]
def test_end_with_period(self):
func_node = astroid.extract_node('''
def test():
"""get news"""
if True:
return 5
return 5
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9002' == got[0][0]
def test_args(self):
func_node = astroid.extract_node('''
def test(scale, mean):
"""get news.
Args:
scale (int): scale is the number.
"""
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9003' == got[0][0]
def test_missing(self):
func_node = astroid.extract_node('''
def test():
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9005' == got[0][0]
def test_indent(self):
func_node = astroid.extract_node('''
def test():
""" get get get get get get get get
get get get get get get get get.
"""
pass
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9006' == got[0][0]
def test_with_returns(self):
func_node = astroid.extract_node('''
def test():
"""get news.
Args:
scale (int): scale is the number.
"""
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
return mean
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9007' == got[0][0]
def test_with_raises(self):
func_node = astroid.extract_node('''
def test():
"""get news.
Args:
scale (int): scale is the number.
"""
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
raise ValueError('A very specific bad thing happened.')
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9008' == got[0][0]
def test_no_message(self):
p = '''
def fc(input,
size,
num_flatten_dims=1,
param_attr=None,
bias_attr=None,
act=None,
name=None):
"""
**Fully Connected Layer**
The fully connected layer can take multiple tensors as its inputs. It
creates a variable called weights for each input tensor, which represents
a fully connected weight matrix from each input unit to each output unit.
The fully connected layer multiplies each input tensor with its corresponding
weight to produce an output Tensor. If multiple input tensors are given,
the results of multiple multiplications will be summed up. If bias_attr is
not None, a bias variable will be created and added to the output. Finally,
if activation is not None, it will be applied to the output as well.
This process can be formulated as follows:
Args:
input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
the input tensor(s) is at least 2.
size(int): The number of output units in this layer.
num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
two dimensions. If this happens, the multidimensional tensor will first be flattened
into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
dimensions will be flattened to form the first dimension of the final matrix (height of
the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
form the second dimension of the final matrix (width of the matrix). For example, suppose
`X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
parameters/weights of this layer.
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
of this layer. If it is set to None, no bias will be added to the output units.
act (str, default None): Activation to be applied to the output of this layer.
name (str, default None): The name of this layer.
Returns:
A tensor variable storing the transformation result.
Raises:
ValueError: If rank of the input tensor is less than 2.
Examples:
.. code-block:: python
data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
fc = fluid.layers.fc(input=data, size=1000, act="tanh")
"""
raise ValueError('A very specific bad thing happened.')
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
return size
'''
func_node = astroid.extract_node(p)
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 0
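# Illustrative local run (assumes this file is saved as
# test_docstring_checker.py with docstring_checker.py importable):
#   python -m pytest test_docstring_checker.py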