提交 b2cb7c6f 编写于 作者: T tangwei12

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into new_api_about_cpkt

......@@ -41,7 +41,6 @@ option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FO
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
......@@ -58,8 +57,10 @@ option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -156,7 +157,6 @@ include(cupti)
include(configure) # add paddle env configuration
include(generic) # simplify cmake module
include(package) # set paddle packages
include(cpplint) # set paddle c++ style
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
include(rdma) # set rdma libraries
......@@ -205,7 +205,7 @@ endif(USE_NNPACK)
add_subdirectory(proto)
if(NOT MOBILE_INFERENCE)
if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
# "add_subdirectory(go)" should be placed after the following loine,
# because it depends on paddle/optimizer.
add_subdirectory(paddle/optimizer)
......@@ -233,3 +233,7 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
if (WITH_CONTRIB)
add_subdirectory(paddle/contrib)
endif()
......@@ -101,6 +101,3 @@ RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
# development image default do build work
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
......@@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp
CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
# Cluster Training Benchmark
## Setup
- Platform
- Kubernetes: v1.6.2
- Linux Kernel: v3.10.0
- Resource
- CPU: 10 Cores per Pod
- Memory: 5GB per Pod
- Docker Image
We use different base Docker Image to run the benchmark on Kubernetes:
- PaddlePaddle v2: paddlepaddle/paddle:0.11.0
- PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
- TensorFlow: tensorflow/tensorflow:1.5.0-rc0
- Model
vgg16 is used in this benchmark.
## Cases
- Variable
- Batch Size of training data.
- PServer count of the training job.
- The number of trainers.
- Invariant
- The resource of trainer/pserver Pod.
### Measure the Performance for Different Batch Size
- PServer Count: 40
- Trainer Count: 100
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure the Performance for Different PServer Count
- Trainer Count: 100
- Batch Size: 64
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>10</th>
<th>20</th>
<th>40 </th>
<th>60</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure Parallel Efficiency By Increasing Trainer Count
- PServer Count: 20
- Batch Size: 64
- Metrics:
$S = \div(T1, TN)$
which S is the ratio of T1 over TN, training time of 1 and N trainers.
The parallel efficiency is:
$E = \div(S, N)$
<table>
<thead>
<tr>
<th>Trainer Counter </th>
<th>1</th>
<th>10</th>
<th>20 </th>
<th>30</th>
<th>40</th>
<th>50</th>
<th>60 </th>
<th>70</th>
<th>80</th>
<th>90</th>
<th>100 </th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
## Reproduce the benchmark
TODO
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
# you can get mirror list here:
# https://launchpad.net/ubuntu/+archivemirrors
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
RUN pip install -U kubernetes opencv-python
RUN pip install paddlepaddle
# if network is slowly, you may need to add proxy here.
# ENV https_proxy=
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# unset proxy if it is setted.
# ENV https_proxy=""
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
# tf k8s
RUN pip install tensorflow==1.4.0
ADD tf_k8s /usr/bin
RUN chmod +x /usr/bin/tf_k8s
ADD vgg16_tf.py /workspace/
# below lines may change a lot for debugging
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD vgg16_fluid.py vgg16_v2.py /workspace/
# Performance for Distributed vgg16
## Test Result
### Hardware Infomation
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz : 2101.000
- cache size : 20480 KB
### Blas settings
Setting environment variable: `MKL_NUM_THREADS=1`.
### Single Node Single Thread
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 15.44 </td>
<td> 16.32 </td>
<td> 16.74 </td>
<td> 16.79 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 15.97 </td>
<td> 17.04 </td>
<td> 17.60 </td>
<td> 17.83 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> 9.09 </td>
<td> 9.10 </td>
<td> 9.24 </td>
<td> 8.66 </td>
</tr>
</tbody>
</table>
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 190.20 </td>
<td> 222.15 </td>
<td> 247.40 </td>
<td> 258.18 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 170.96 </td>
<td> 233.71 </td>
<td> 256.14 </td>
<td> 329.23 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Accelerate Rate
- Pserver Count: 20
- Batch Size: 128
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Trainer Count </th>
<th>20</th>
<th>40</th>
<th>80</th>
<th>100</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 263.29 (78.64%) </td>
<td> 518.80 (77.47%) </td>
<td> 836.26 (62.44%) </td>
<td> 1019.29 (60.89%) </td>
</tr>
<tr>
<td>PaddlePaddle v2 (need more tests) </td>
<td> 326.85 (92.85%) </td>
<td> 534.58 (75.93%) </td>
<td> 853.30 (60.60%) </td>
<td> 1041.99 (59.20%) </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Different Pserver Count
- Trainer Count: 60
- Batch Size: 128
- Metrics: samples/ sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>3</th>
<th>6</th>
<th>10</th>
<th>20</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid(should fix in next PR) </td>
<td> 589.1 </td>
<td> 592.6 </td>
<td> 656.4 </td>
<td> 655.8 </td>
</tr>
<tr>
<td>PaddlePaddle v2 (need more tests) </td>
<td> 593.4 </td>
<td> 791.3 </td>
<td> 729.7 </td>
<td> 821.7 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
*The performance gap between Fuild and v2 comes from the network interference.*
## Steps to Run the Performance Test
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
## Enable Verbos Logs
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail.
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: MKL_NUM_THREADS
value: "1"
- name: TRAINING_ROLE
value: "PSERVER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
command: ["paddle_k8s", "start_fluid"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: TRAINING_ROLE
value: "TRAINER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
#!/bin/bash
# Update to point to the source file.
VGG_SRC="vgg16_fluid.py"
export TRAINING_ROLE=PSERVER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
# Need to wait for the ps to start first.
sleep 10
echo "done start ps"
export TRAINING_ROLE=TRAINER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
#!/bin/bash
check_trainer_ret() {
ret=$1
stdbuf -oL echo "job returned $ret...setting pod return message..."
stdbuf -oL echo "==============================="
if [ $ret -eq 136 ] ; then
echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
elif [ $ret -eq 139 ] ; then
echo "Segmentation Fault" > /dev/termination-log
elif [ $ret -eq 1 ] ; then
echo "General Error" > /dev/termination-log
elif [ $ret -eq 134 ] ; then
echo "Program Abort" > /dev/termination-log
fi
stdbuf -oL echo "termination log wroted..."
exit $ret
}
g_pservers=""
g_trainers=""
wait_running_pods(){
pserver_label="tf-job-pserver=${JOB_NAME}"
trainer_label="tf-job-trainer=${JOB_NAME}"
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
}
start_tf_pserver(){
wait_running_pods
label="tf-job-pserver=${JOB_NAME}"
pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
}
start_tf_trainer(){
wait_running_pods
label="tf-job-trainer=${JOB_NAME}"
trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
check_trainer_ret $?
}
start_tf(){
if [[ "${TF_JOB_NAME}" == "worker" ]]; then
start_tf_trainer
else
start_tf_pserver
fi
}
usage() {
echo "usage: tf_k8s [<args>]:"
echo " start_tf Start tensorflow jobs"
}
case "$1" in
start_tf)
start_tf
;;
--help)
usage
;;
*)
usage
;;
esac
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-tf-pserver
spec:
replicas: 10
template:
metadata:
labels:
tf-job-pserver: vgg16job-tf
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: ENTRY
value: "python vgg16_tf.py"
- name: JOB_NAME
value: vgg16job-tf
- name: PSERVERS_NUM
value: "10"
- name: TF_JOB_NAME
value: "ps"
- name: TRAINERS_NUM
value: "20"
- name: BATCH_SIZE
value: "128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-tf-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
tf-job-trainer: vgg16job-tf
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: JOB_NAME
value: vgg16job-tf
- name: TF_JOB_NAME
value: "worker"
- name: ENTRY
value: "python vgg16_tf.py"
- name: PSERVERS_NUM
value: "10"
- name: BATCH_SIZE
value: "128"
- name: TRAINERS_NUM
value: "20"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16v2job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16v2job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "python train.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
command: ["paddle_k8s", "start_pserver"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16v2job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16v2job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "256"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function
import sys
import time
import numpy as np
import paddle.v2 as paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
import argparse
import functools
import os
from paddle.fluid import debuger
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='CPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument('--device_id', type=int, default=0, help="The device id.")
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
help='The data order, now only support NCHW.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--local',
type=str2bool,
default=True,
help='Whether to run as local mode.')
parser.add_argument(
"--ps_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--trainer_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--profile", action='store_true', help="If set, profile a few steps.")
# Flags for defining the tf.train.Server
parser.add_argument(
"--task_index", type=int, default=0, help="Index of task within the job")
args = parser.parse_args()
def vgg16_bn_drop(input):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act='relu',
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type='max')
conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
return fc2
def main():
if args.data_set == "cifar10":
classdim = 10
if args.data_format == 'NCHW':
data_shape = [3, 32, 32]
else:
data_shape = [32, 32, 3]
else:
classdim = 102
if args.data_format == 'NCHW':
data_shape = [3, 224, 224]
else:
data_shape = [224, 224, 3]
# Input data
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(batch_acc)
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimize_ops, params_grads = optimizer.minimize(avg_cost)
# Initialize executor
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
args.device_id)
exe = fluid.Executor(place)
# test
def test(exe):
test_pass_acc = fluid.average.WeightedAverage()
for batch_id, data in enumerate(test_reader()):
img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
outs = exe.run(inference_program,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[batch_acc, batch_size])
test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
return test_pass_acc.eval()
def train_loop(exe, trainer_prog):
iters = 0
ts = time.time()
train_pass_acc = fluid.average.WeightedAverage()
for pass_id in range(args.num_passes):
# train
start_time = time.time()
num_samples = 0
train_pass_acc.reset()
def run_step(batch_id, data):
img_data = np.array(
map(lambda x: x[0].reshape(data_shape), data)).astype(
"float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
loss, acc, b_size = exe.run(
trainer_prog,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[avg_cost, batch_acc, batch_size])
return loss, acc, b_size
if args.profile and args.task_index == 0:
# warmup.
for batch_id, data in enumerate(train_reader()):
if batch_id > 5: break
run_step(batch_id, data)
with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
for batch_id, data in enumerate(train_reader()):
if batch_id > 5: break
run_step(batch_id, data)
for batch_id, data in enumerate(train_reader()):
ts = time.time()
loss, acc, b_size = run_step(batch_id, data)
iters += 1
num_samples += len(data)
train_pass_acc.add(value=acc, weight=b_size)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
"Speed = %.2f img/s" % (pass_id, iters, loss, acc,
len(data) / (time.time() - ts))
) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time
pass_train_acc = train_pass_acc.eval()
pass_test_acc = test(exe)
print("Task:%d Pass = %d, Training performance = %f imgs/s, "
"Train accuracy = %f, Test accuracy = %f\n" %
(args.task_index, pass_id, num_samples / pass_elapsed,
pass_train_acc, pass_test_acc))
if args.local:
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
train_loop(exe, fluid.default_main_program())
else:
trainers = int(os.getenv("TRAINERS")) # total trainer count
print("trainers total: ", trainers)
training_role = os.getenv(
"TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id=args.task_index,
pservers=args.ps_hosts,
trainers=trainers)
if training_role == "PSERVER":
current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
"PADDLE_INIT_PORT")
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog)
exe.run(pserver_startup)
exe.run(pserver_prog)
elif training_role == "TRAINER":
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
paddle.dataset.flowers.test(),
batch_size=args.batch_size)
trainer_prog = t.get_trainer_program()
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
# TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
exe.run(fluid.default_startup_program())
train_loop(exe, trainer_prog)
else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == "__main__":
print_arguments()
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in TensorFlow
You can get distribution example template structure here:
https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
https://www.tensorflow.org/deploy/distributed
"""
import tensorflow as tf
import paddle.v2 as paddle
import numpy as np
import argparse
import time
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='CPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
'--data_format',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
help='The data order, NCHW=[batch, channels, height, width].'
'Only support NHWC right now.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
"--ps_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--worker_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--job_name", type=str, default="", help="One of 'worker', 'ps'")
# Flags for defining the tf.train.Server
parser.add_argument(
"--task_index", type=int, default=0, help="Index of task within the job")
args = parser.parse_args()
class VGG16Model(object):
def __init__(self):
self.parameters = []
def batch_norm_relu(self, inputs, is_training):
"""Performs a batch normalization followed by a ReLU."""
# We set fused=True for a significant speed boost. See
# https://www.tensorflow.org/speed/speed_guide#common_fused_ops
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if args.data_format == 'NCHW' else -1,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
inputs = tf.nn.relu(inputs)
return inputs
def conv_bn_layer(self,
name,
images,
kernel_shape,
is_training,
drop_rate=0.0):
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
kernel_shape, dtype=tf.float32, stddev=1e-1),
name='weights')
conv = tf.nn.conv2d(
images,
kernel, [1, 1, 1, 1],
data_format=args.data_format,
padding='SAME')
biases = tf.Variable(
tf.constant(
0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(conv, biases)
out = self.batch_norm_relu(out, is_training)
out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
return out
def fc_layer(self, name, inputs, shape):
with tf.name_scope(name) as scope:
fc_w = tf.Variable(
tf.truncated_normal(
shape, dtype=tf.float32, stddev=1e-1),
name='weights')
fc_b = tf.Variable(
tf.constant(
0.0, shape=[shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
return out
def network(self, images, class_dim, is_training):
""" VGG16 model structure.
TODO(kuke): enable this network to support the 'NCHW' data format
"""
# conv1
conv1_1 = self.conv_bn_layer(
'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
conv1_2 = self.conv_bn_layer(
'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
# pool1
pool1 = tf.nn.max_pool(
conv1_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool1')
# conv2
conv2_1 = self.conv_bn_layer(
'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
conv2_2 = self.conv_bn_layer(
'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
# pool2
pool2 = tf.nn.max_pool(
conv2_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool2')
# conv3
conv3_1 = self.conv_bn_layer(
'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
conv3_2 = self.conv_bn_layer(
'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
conv3_3 = self.conv_bn_layer(
'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
# pool3
pool3 = tf.nn.max_pool(
conv3_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool3')
# conv4
conv4_1 = self.conv_bn_layer(
'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
conv4_2 = self.conv_bn_layer(
'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv4_3 = self.conv_bn_layer(
'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool4
pool4 = tf.nn.max_pool(
conv4_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# conv5
conv5_1 = self.conv_bn_layer(
'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_2 = self.conv_bn_layer(
'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_3 = self.conv_bn_layer(
'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool5
pool5 = tf.nn.max_pool(
conv5_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# flatten
shape = int(np.prod(pool5.get_shape()[1:]))
pool5_flat = tf.reshape(pool5, [-1, shape])
# fc1
drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
fc1 = self.fc_layer('fc1', drop, [shape, 512])
# fc2
bn = self.batch_norm_relu(fc1, is_training)
drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
fc2 = self.fc_layer('fc2', drop, [512, 512])
fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
return fc3
def run_benchmark(cluster_spec, server):
"""Run benchmark on cifar10 or flowers."""
if args.data_set == "cifar10":
class_dim = 10
raw_shape = (3, 32, 32)
dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
None, 3, 32, 32)
else:
class_dim = 102
raw_shape = (3, 224, 224)
dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
None, 3, 224, 224)
device = tf.train.replica_device_setter(
worker_device="/job:worker/task:{}".format(args.task_index),
cluster=cluster_spec)
with tf.device(device):
images = tf.placeholder(tf.float32, shape=dat_shape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
vgg16 = VGG16Model()
logits = vgg16.network(images, class_dim, is_training)
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
avg_loss = tf.reduce_mean(loss)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
global_step = tf.Variable(0, name='global_step', trainable=False)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_loss, global_step=global_step)
summary_op = tf.summary.merge_all()
init_op = tf.global_variables_initializer()
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
buf_size=5120),
batch_size=args.batch_size)
# test
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
return np.mean(test_accs)
config = tf.ConfigProto(
intra_op_parallelism_threads=1,
inter_op_parallelism_threads=1,
log_device_placement=True)
config.gpu_options.allow_growth = True
hooks = [tf.train.StopAtStepHook(last_step=1000000)]
with tf.train.MonitoredTrainingSession(
master=server.target,
is_chief=(args.task_index == 0),
hooks=hooks,
config=config) as sess:
iters, num_samples, start_time = 0, 0, 0.0
for pass_id in range(args.num_passes):
# train
num_samples = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
train_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
train_labels = np.array(map(lambda x: x[1], data)).astype(
'int64')
iter_begin_time = time.time()
_, loss, acc = sess.run([train_op, avg_loss, accuracy],
feed_dict={
images: train_images,
labels: train_labels,
is_training: True
})
iters += 1
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
% (pass_id, iters, loss, acc,
len(data) / (time.time() - iter_begin_time)))
num_samples += len(data)
train_elapsed = time.time() - start_time
# test
pass_test_acc = test()
print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, pass_test_acc))
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
print_arguments()
ps_hosts = args.ps_hosts.split(",")
worker_hosts = args.worker_hosts.split(",")
# Create a cluster from the parameter server and worker hosts.
cluster_spec = tf.train.ClusterSpec({
"ps": ps_hosts,
"worker": worker_hosts
})
# Create and start a server for the local task.
server = tf.train.Server(
cluster_spec, job_name=args.job_name, task_index=args.task_index)
if args.job_name == "ps":
print("start pserver")
server.join()
elif args.job_name == "worker":
print("start worker")
run_benchmark(cluster_spec, server)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os
DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
print "batch_size", BATCH_SIZE
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0
def vgg(input, nums, class_dim):
def conv_block(input, num_filter, groups, num_channels=None):
return paddle.networks.img_conv_group(
input=input,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=paddle.activation.Relu(),
pool_type=paddle.pooling.Max())
assert len(nums) == 5
# the channel of input feature is 3
conv1 = conv_block(input, 64, nums[0], 3)
conv2 = conv_block(conv1, 128, nums[1])
conv3 = conv_block(conv2, 256, nums[2])
conv4 = conv_block(conv3, 512, nums[3])
conv5 = conv_block(conv4, 512, nums[4])
fc_dim = 512
fc1 = paddle.layer.fc(input=conv5,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(input=fc1,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
out = paddle.layer.fc(input=fc2,
size=class_dim,
act=paddle.activation.Softmax())
return out
def vgg13(input, class_dim):
nums = [2, 2, 2, 2, 2]
return vgg(input, nums, class_dim)
def vgg16(input, class_dim):
nums = [2, 2, 3, 3, 3]
return vgg(input, nums, class_dim)
def vgg19(input, class_dim):
nums = [2, 2, 4, 4, 4]
return vgg(input, nums, class_dim)
def main():
global ts
paddle.init(use_gpu=False)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
# NOTE: for v2 distributed training need averaging updates.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=learning_rate / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
train_reader = paddle.batch(
paddle.reader.shuffle(
cifar.train10(),
# To use other data, replace the above line with:
# reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
cifar.test10(),
# To use other data, replace the above line with:
# reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers,
is_local=False)
# End batch and end pass event handler
def event_handler(event):
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
if __name__ == '__main__':
main()
......@@ -94,6 +94,10 @@ def parse_args():
'--memory_optimize',
action='store_true',
help='If set, optimize runtime memory before start.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='If set ommit the actual read data operators.')
parser.add_argument(
'--update_method',
type=str,
......@@ -198,6 +202,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe.run(train_prog)
return
if args.use_fake_data:
raise Exception(
"fake data is not supported in single GPU test for now.")
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_prog)
......@@ -244,7 +252,31 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id):
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
# generate fake:
if args.use_fake_data:
for var in feed_var_list:
v = startup_prog.global_block().clone_variable(var)
var.persistable = True
v.persistable = True
real_shape = list(var.shape)
real_shape[0] = args.batch_size / args.gpus
startup_prog.global_block().append_op(
outputs={"Out": v},
type="fill_constant",
attrs={"shape": real_shape,
"value": 1.0,
"dtype": var.dtype})
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
if nccl_id_var and trainer_id == 0:
#FIXME(wuyi): wait other trainer to start listening
time.sleep(30)
startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
......@@ -256,10 +288,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
exec_strategy=strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in range(args.pass_num):
num_samples = 0
......@@ -271,6 +300,9 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
num_samples = 0
if iters == args.iterations:
break
if args.use_fake_data:
loss, = exe.run([avg_loss.name])
else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
......
......@@ -112,6 +112,7 @@ def gen_job():
envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster.
envs.append({
......
......@@ -54,5 +54,13 @@ envs = [
"fieldPath": "status.podIP"
}
}
},
{
"name": "PADDLE_CURRENT_IP",
"valueFrom": {
"fieldRef": {
"fieldPath": "status.podIP"
}
}
}
]
......@@ -41,6 +41,10 @@ if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
if(EIGEN_USE_THREADS)
add_definitions(-DEIGEN_USE_THREADS)
endif(EIGEN_USE_THREADS)
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
......
# util to check C++ file style
# * it basically use google cpplint.py.
# * It provide "add_style_check_target" for cmake.
# Usage see add_style_check_target's document
#
# TODO(yuyang18): Add python style check.
set(STYLE_FILTER)
# diable unwanted filters
# paddle do not indent public/potected/private in class
set(STYLE_FILTER "${STYLE_FILTER}-whitespace/indent,")
# paddle use mutable reference. BUT IT IS NOT RECOMMANDED
set(STYLE_FILTER "${STYLE_FILTER}-runtime/references,")
# paddle use relative path for include.
set(STYLE_FILTER "${STYLE_FILTER}-build/include,")
# paddle use <thread>, <mutex>, etc.
set(STYLE_FILTER "${STYLE_FILTER}-build/c++11,")
# paddle use c style casting. BUT IT IS NOT RECOMMANDED
set(STYLE_FILTER "${STYLE_FILTER}-readability/casting")
# IGNORE SOME FILES
set(IGNORE_PATTERN
.*ImportanceSampler.*
.*cblas\\.h.*
.*\\.pb\\.txt
.*MultiDataProvider.*
.*pb.*
.*pybind.h)
# add_style_check_target
#
# attach check code style step for target.
#
# first argument: target name to attach
# rest arguments: source list to check code style.
#
# NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
macro(add_style_check_target TARGET_NAME)
if(WITH_STYLE_CHECK)
set(SOURCES_LIST ${ARGN})
list(REMOVE_DUPLICATES SOURCES_LIST)
foreach(filename ${SOURCES_LIST})
foreach(pattern ${IGNORE_PATTERN})
if(filename MATCHES ${pattern})
list(REMOVE_ITEM SOURCES_LIST ${filename})
endif()
endforeach()
endforeach()
if(SOURCES_LIST)
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}"
${SOURCES_LIST}
COMMENT "cpplint: Checking source code style"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endif()
endmacro()
......@@ -23,17 +23,20 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
include(ProcessorCount)
ProcessorCount(NUM_OF_PROCESSOR)
IF(APPLE)
SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
ELSE()
SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
ENDIF()
ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
GIT_REPOSITORY "https://github.com/grpc/grpc.git"
GIT_TAG "v1.10.x"
URL "http://paddlepaddledeps.bj.bcebos.com/grpc.tar.xz"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
......
......@@ -212,6 +212,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS}
-Dprotobuf_BUILD_TESTS=OFF
-DCMAKE_SKIP_RPATH=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
......
......@@ -206,8 +206,6 @@ function(cc_library TARGET_NAME)
list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
else(cc_library_SRCS)
if(cc_library_DEPS)
merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
......@@ -271,7 +269,6 @@ function(nv_library TARGET_NAME)
list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
else(nv_library_SRCS)
if (nv_library_DEPS)
merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
......@@ -344,7 +341,6 @@ function(hip_library TARGET_NAME)
list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
else(hip_library_SRCS)
if (hip_library_DEPS)
merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
......
......@@ -172,6 +172,7 @@ add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})
# paddle fluid version
execute_process(
COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
set(version_file ${FLUID_INSTALL_DIR}/version.txt)
file(WRITE ${version_file}
......
......@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output director
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
set(IMPORT_PADDLE_STRING "")
set(IMPORT_PADDLEV2_STRING "")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
......@@ -27,8 +30,6 @@ sphinx_add_target(paddle_fluid_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
......@@ -50,6 +51,4 @@ sphinx_add_target(paddle_fluid_docs_cn
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
add_subdirectory(api)
......@@ -7,6 +7,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output director
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
set(IMPORT_PADDLE_STRING "import paddle")
set(IMPORT_PADDLEV2_STRING "import paddle.v2")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
====
clip
====
ErrorClipByValue
----------------
.. autoclass:: paddle.fluid.clip.ErrorClipByValue
:members:
:noindex:
GradientClipByValue
-------------------
.. autoclass:: paddle.fluid.clip.GradientClipByValue
:members:
:noindex:
GradientClipByNorm
------------------
.. autoclass:: paddle.fluid.clip.GradientClipByNorm
:members:
:noindex:
GradientClipByGlobalNorm
------------------------
.. autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
:members:
:noindex:
append_gradient_clip_ops
------------------------
.. autofunction:: paddle.fluid.clip.append_gradient_clip_ops
:noindex:
error_clip_callback
-------------------
.. autofunction:: paddle.fluid.clip.error_clip_callback
:noindex:
......@@ -5,24 +5,3 @@
evaluator
=========
ChunkEvaluator
--------------
.. autoclass:: paddle.fluid.evaluator.ChunkEvaluator
:members:
:noindex:
EditDistance
--------------
.. autoclass:: paddle.fluid.evaluator.EditDistance
:members:
:noindex:
DetectionMAP
--------------
.. autoclass:: paddle.fluid.evaluator.DetectionMAP
:members:
:noindex:
......@@ -30,3 +30,9 @@ switch_scope
.. autofunction:: paddle.fluid.executor.switch_scope
:noindex:
fetch_var
---------
.. autofunction:: paddle.fluid.executor.fetch_var
:noindex:
#!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
do
python gen_doc.py ${module} > ${module}.rst
done
......@@ -9,8 +9,9 @@ Fluid
data_feeder.rst
executor.rst
initializer.rst
evaluator.rst
metrics.rst
nets.rst
clip.rst
optimizer.rst
param_attr.rst
profiler.rst
......
......@@ -33,11 +33,16 @@ Xavier
:members:
:noindex:
MSRA
------
force_init_on_cpu
-----------------
.. autoclass:: paddle.fluid.initializer.MSRA
:members:
.. autofunction:: paddle.fluid.initializer.force_init_on_cpu
:noindex:
init_on_cpu
-----------
.. autofunction:: paddle.fluid.initializer.init_on_cpu
:noindex:
ConstantInitializer
......@@ -68,9 +73,3 @@ XavierInitializer
:members:
:noindex:
MSRAInitializer
-----------------
.. autoclass:: paddle.fluid.initializer.MSRAInitializer
:members:
:noindex:
......@@ -55,6 +55,13 @@ While
:members:
:noindex:
Switch
------
.. autoclass:: paddle.fluid.layers.Switch
:members:
:noindex:
lod_rank_table
--------------
......@@ -67,12 +74,6 @@ max_sequence_len
.. autofunction:: paddle.fluid.layers.max_sequence_len
:noindex:
topk
----
.. autofunction:: paddle.fluid.layers.topk
:noindex:
lod_tensor_to_array
-------------------
......@@ -109,6 +110,12 @@ less_than
.. autofunction:: paddle.fluid.layers.less_than
:noindex:
equal
-----
.. autofunction:: paddle.fluid.layers.equal
:noindex:
array_read
----------
......@@ -212,6 +219,42 @@ Send
.. autofunction:: paddle.fluid.layers.Send
:noindex:
open_recordio_file
------------------
.. autofunction:: paddle.fluid.layers.open_recordio_file
:noindex:
open_files
----------
.. autofunction:: paddle.fluid.layers.open_files
:noindex:
read_file
---------
.. autofunction:: paddle.fluid.layers.read_file
:noindex:
shuffle
-------
.. autofunction:: paddle.fluid.layers.shuffle
:noindex:
batch
-----
.. autofunction:: paddle.fluid.layers.batch
:noindex:
double_buffer
-------------
.. autofunction:: paddle.fluid.layers.double_buffer
:noindex:
nn
==
......@@ -281,12 +324,6 @@ square_error_cost
.. autofunction:: paddle.fluid.layers.square_error_cost
:noindex:
accuracy
--------
.. autofunction:: paddle.fluid.layers.accuracy
:noindex:
chunk_eval
----------
......@@ -311,6 +348,18 @@ sequence_pool
.. autofunction:: paddle.fluid.layers.sequence_pool
:noindex:
sequence_softmax
----------------
.. autofunction:: paddle.fluid.layers.sequence_softmax
:noindex:
softmax
-------
.. autofunction:: paddle.fluid.layers.softmax
:noindex:
pool2d
------
......@@ -323,12 +372,6 @@ batch_norm
.. autofunction:: paddle.fluid.layers.batch_norm
:noindex:
layer_norm
----------
.. autofunction:: paddle.fluid.layers.layer_norm
:noindex:
beam_search_decode
------------------
......@@ -377,6 +420,12 @@ reduce_min
.. autofunction:: paddle.fluid.layers.reduce_min
:noindex:
reduce_prod
-----------
.. autofunction:: paddle.fluid.layers.reduce_prod
:noindex:
sequence_first_step
-------------------
......@@ -425,6 +474,12 @@ matmul
.. autofunction:: paddle.fluid.layers.matmul
:noindex:
topk
----
.. autofunction:: paddle.fluid.layers.topk
:noindex:
warpctc
-------
......@@ -473,6 +528,60 @@ multiplex
.. autofunction:: paddle.fluid.layers.multiplex
:noindex:
layer_norm
----------
.. autofunction:: paddle.fluid.layers.layer_norm
:noindex:
softmax_with_cross_entropy
--------------------------
.. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
:noindex:
smooth_l1
---------
.. autofunction:: paddle.fluid.layers.smooth_l1
:noindex:
one_hot
-------
.. autofunction:: paddle.fluid.layers.one_hot
:noindex:
autoincreased_step_counter
--------------------------
.. autofunction:: paddle.fluid.layers.autoincreased_step_counter
:noindex:
reshape
-------
.. autofunction:: paddle.fluid.layers.reshape
:noindex:
lod_reset
---------
.. autofunction:: paddle.fluid.layers.lod_reset
:noindex:
lrn
---
.. autofunction:: paddle.fluid.layers.lrn
:noindex:
pad
---
.. autofunction:: paddle.fluid.layers.pad
:noindex:
label_smooth
------------
......@@ -480,7 +589,7 @@ label_smooth
:noindex:
roi_pool
---------
--------
.. autofunction:: paddle.fluid.layers.roi_pool
:noindex:
......@@ -501,18 +610,6 @@ mul
.. autofunction:: paddle.fluid.layers.mul
:noindex:
reshape
-------
.. autofunction:: paddle.fluid.layers.reshape
:noindex:
pad
---
.. autofunction:: paddle.fluid.layers.pad
:noindex:
scale
-----
......@@ -579,10 +676,70 @@ clip_by_norm
.. autofunction:: paddle.fluid.layers.clip_by_norm
:noindex:
sequence_softmax
----------------
logical_and
-----------
.. autofunction:: paddle.fluid.layers.sequence_softmax
.. autofunction:: paddle.fluid.layers.logical_and
:noindex:
logical_or
----------
.. autofunction:: paddle.fluid.layers.logical_or
:noindex:
logical_xor
-----------
.. autofunction:: paddle.fluid.layers.logical_xor
:noindex:
logical_not
-----------
.. autofunction:: paddle.fluid.layers.logical_not
:noindex:
uniform_random
--------------
.. autofunction:: paddle.fluid.layers.uniform_random
:noindex:
uniform_random_batch_size_like
------------------------------
.. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
:noindex:
gaussian_random
---------------
.. autofunction:: paddle.fluid.layers.gaussian_random
:noindex:
gaussian_random_batch_size_like
-------------------------------
.. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
:noindex:
cumsum
------
.. autofunction:: paddle.fluid.layers.cumsum
:noindex:
scatter
-------
.. autofunction:: paddle.fluid.layers.scatter
:noindex:
sum
---
.. autofunction:: paddle.fluid.layers.sum
:noindex:
sigmoid
......@@ -651,6 +808,18 @@ floor
.. autofunction:: paddle.fluid.layers.floor
:noindex:
cos
---
.. autofunction:: paddle.fluid.layers.cos
:noindex:
sin
---
.. autofunction:: paddle.fluid.layers.sin
:noindex:
round
-----
......@@ -834,4 +1003,9 @@ dice_loss
.. autofunction:: paddle.fluid.layers.dice_loss
:noindex:
upsampling_bilinear2d
____
.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=======
metrics
=======
MetricBase
----------
.. autoclass:: paddle.fluid.metrics.MetricBase
:members:
:noindex:
CompositeMetric
---------------
.. autoclass:: paddle.fluid.metrics.CompositeMetric
:members:
:noindex:
Accuracy
--------
.. autoclass:: paddle.fluid.metrics.Accuracy
:members:
:noindex:
ChunkEvaluator
--------------
.. autoclass:: paddle.fluid.metrics.ChunkEvaluator
:members:
:noindex:
EditDistance
------------
.. autoclass:: paddle.fluid.metrics.EditDistance
:members:
:noindex:
DetectionMAP
------------
.. autoclass:: paddle.fluid.metrics.DetectionMAP
:members:
:noindex:
Auc
---
.. autoclass:: paddle.fluid.metrics.Auc
:members:
:noindex:
......@@ -111,6 +111,7 @@ DecayedAdagradOptimizer
:members:
:noindex:
AdadeltaOptimizer
-----------------
......@@ -118,9 +119,17 @@ AdadeltaOptimizer
:members:
:noindex:
RMSPropOptimizer
-----------------
.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
:members:
:noindex:
Optimizer
---------
.. autoclass:: paddle.fluid.optimizer.Optimizer
:members:
:noindex:
......@@ -11,6 +11,13 @@ append_regularization_ops
.. autofunction:: paddle.fluid.regularizer.append_regularization_ops
:noindex:
WeightDecayRegularizer
----------------------
.. autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer
:members:
:noindex:
L1Decay
-------
......@@ -26,15 +33,16 @@ L2Decay
:noindex:
L1DecayRegularizer
---------------------
------------------
.. autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
:members:
:noindex:
L2DecayRegularizer
---------------------
------------------
.. autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
:members:
:noindex:
......@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output director
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
set(IMPORT_PADDLE_STRING "")
set(IMPORT_PADDLEV2_STRING "")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
......@@ -27,8 +30,6 @@ sphinx_add_target(paddle_mobile_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_mobile_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
......@@ -49,5 +50,3 @@ sphinx_add_target(paddle_mobile_docs_cn
${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
add_dependencies(paddle_mobile_docs_cn gen_proto_py paddle_python)
移动端
=====
======
.. toctree::
:maxdepth: 1
......
......@@ -16,8 +16,8 @@ import os, subprocess
sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
import shlex
from recommonmark import parser, transform
import paddle
import paddle.v2
@IMPORT_PADDLE_STRING@
@IMPORT_PADDLEV2_STRING@
MarkdownParser = parser.CommonMarkParser
AutoStructify = transform.AutoStructify
......
......@@ -16,8 +16,8 @@ import os, subprocess
sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
import shlex
from recommonmark import parser, transform
import paddle
import paddle.v2
@IMPORT_PADDLE_STRING@
@IMPORT_PADDLEV2_STRING@
MarkdownParser = parser.CommonMarkParser
......
......@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output director
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
set(IMPORT_PADDLE_STRING "")
set(IMPORT_PADDLEV2_STRING "")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
......@@ -27,8 +30,6 @@ sphinx_add_target(paddle_v2_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
......@@ -50,6 +51,4 @@ sphinx_add_target(paddle_v2_docs_cn
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
add_subdirectory(api)
......@@ -7,6 +7,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output director
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
set(IMPORT_PADDLE_STRING "import paddle")
set(IMPORT_PADDLEV2_STRING "import paddle.v2")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
......
......@@ -19,8 +19,8 @@
----------------
PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像
可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到,您也可以
在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`_ 找到 paddle_manylinux_devel
可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`__ 找到,您也可以
在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__ 找到 paddle_manylinux_devel
镜像的编译以及使用方法。或者参考下述可选步骤,从源码中构建用于编译PaddlePaddle的Docker镜像。
如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
......@@ -35,13 +35,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
# 2. 可选步骤:源码中构建用于编译PaddlePaddle的Docker镜像
docker build -t paddle:dev .
# 3. 执行下面的命令编译CPU-Only的二进制
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. 或者也可以使用为上述可选步骤构建的镜像(必须先执行第2步)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
注:上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。如果使用自行
构建的镜像(上述第4步)会执行 :code:`Dockerfile` 描述的默认入口程序 :code:`build.sh` 可以省略步骤3中
最后的执行脚本的命令。
注:上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。
编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装:
......@@ -72,15 +70,15 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
如果期望执行其中一个单元测试,(比如 :code:`test_sum_op` ):
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh
cd /paddle/build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
./paddle/scripts/paddle_build.sh build
cd build
ctest -R test_sum_op -V
.. _faq_docker:
......@@ -116,11 +114,10 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
```emacs
.. code-block:: emacs
(global-set-key "\C-cc" 'compile)
(setq compile-command
"docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
```
(setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
......
......@@ -23,7 +23,7 @@ You need to use Docker to build PaddlePaddle
to avoid installing dependencies by yourself. We have several pre-built
Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ ,
you can also find how to build and use paddle_manylinux_devel Docker image from
`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`_
`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__
Or you can build your own image from source as the optional step below:
.. code-block:: bash
......@@ -34,14 +34,12 @@ Or you can build your own image from source as the optional step below:
# 2. Optional: build development docker image from source
docker build -t paddle:dev .
# 3. Run the following command to build a CPU-Only binaries
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
NOTE: The above command try to mount the current working directory (root directory of source code)
into :code:`/paddle` directory inside docker container. If you are using your own image
(Step 4) it will run default entry-point :code:`build.sh` , so you could omit the last
command in step 3.
into :code:`/paddle` directory inside docker container.
When the compile finishes, you can get the output whl package under
build/python/dist, then you can choose to install the whl on local
......@@ -74,21 +72,21 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
If you wish to run only one unit test, like :code:`test_sum_op`:
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh
cd /paddle/build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
./paddle/scripts/paddle_build.sh build
cd build
ctest -R test_sum_op -V
.. _faq_docker:
Frequently Asked Questions
----------------
---------------------------
- What is Docker?
......@@ -118,11 +116,10 @@ Frequently Asked Questions
Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file:
```emacs
.. code-block:: emacs
(global-set-key "\C-cc" 'compile)
(setq compile-command
"docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
```
(setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
......@@ -145,7 +142,7 @@ Frequently Asked Questions
.. _compile_deps:
Appendix: Compile Dependencies
----------------
-------------------------------
PaddlePaddle need the following dependencies when compiling, other dependencies
will be downloaded automatically.
......@@ -166,11 +163,11 @@ will be downloaded automatically.
.. _build_options:
Appendix: Build Options
----------------
-------------------------
Build options include whether build binaries for CPU or GPU, which BLAS
library to use etc. You may pass these settings when running cmake.
For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ 。
For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`__
You can add :code:`-D` argument to pass such options, like:
......@@ -219,7 +216,7 @@ keep on with latest cuDNN versions. Be sure to run with the same version of cuDN
you built.
Pass Compile Options
++++++++++++++
++++++++++++++++++++++
You can pass compile options to use intended BLAS/CUDA/Cudnn libraries.
When running cmake command, it will search system paths like
......
......@@ -73,6 +73,7 @@
当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码:
.. code-block:: bash
docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
cd /work
python train.py
......@@ -97,7 +98,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
国内用户可以使用下面的镜像源来加速访问:
.. code-block: bash
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
......
......@@ -80,6 +80,7 @@ Also, you can go into the container shell, run or debug your code
interactively:
.. code-block:: bash
docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
cd /work
python train.py
......@@ -104,7 +105,7 @@ We provide a packaged book image, simply issue the command:
For users in China, we provide a faster mirror:
.. code-block: bash
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
......
......@@ -6,7 +6,7 @@
PaddlePaddle针对不同的用户群体提供了多种安装方式。
专注深度学习模型开发
-----------------
--------------------
PaddlePaddle提供了多种python wheel包,可通过pip一键安装:
......@@ -18,7 +18,7 @@ PaddlePaddle提供了多种python wheel包,可通过pip一键安装:
这是最便捷的安装方式,请根据机器配置和系统选择对应的安装包。
关注底层框架
----------
-------------
PaddlePaddle提供了基于Docker的安装方式,请参照以下教程:
......@@ -45,7 +45,7 @@ PaddlePaddle提供了基于Docker的安装方式,请参照以下教程:
常见问题汇总
-----------
--------------
如果在安装过程中遇到了问题,请先尝试在下面的页面寻找答案:
......
install and Compile
==========
======================
.. _install_steps:
PaddlePaddle provides various methods of installation for many different users
Focus on Deep Learning Model Development
-----------------
----------------------------------------
PaddlePaddle provides lots of packages of python wheel , that pip can install:
......@@ -18,7 +18,7 @@ PaddlePaddle provides lots of packages of python wheel , that pip can install:
This is the most convenient way of installation. Please choose the right installation package with machine configure and system.
Follow the Bottom Frame
----------
------------------------
PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
......
......@@ -55,11 +55,11 @@ paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版
:header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
:widths: 1, 3, 3
"cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
"cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
"cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
.. _pip_dependency:
......
......@@ -58,11 +58,11 @@ If the links below shows up the login form, just click "Log in as guest" to star
:header: "version", "cp27-cp27mu", "cp27-cp27m"
:widths: 1, 3, 3
"cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
"cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
"cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
.. _pip_dependency:
......
......@@ -59,7 +59,7 @@
代码示例如下:
```python
from paddle.utils.merge_model import merge_v2_modelss
from paddle.utils.merge_model import merge_v2_model
from mnist_v2 import network
net = network(is_infer=True)
......
......@@ -13,4 +13,3 @@
# limitations under the License.
#
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
add_style_check_target(test_cclient test_cclient.c)
......@@ -11,7 +11,6 @@ GTAGS
*.pb.cc
*.pb.h
*_pb2.py
paddle_*
output/
google/
Makefile
......
......@@ -33,9 +33,6 @@ add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER}
target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
${CAPI_PRIVATE_HEADER})
add_dependencies(paddle_capi paddle_proto paddle_gserver)
# TODO: paddle_capi_whole will be removed.
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
add_subdirectory(inference)
......@@ -89,7 +89,7 @@ cd Paddle
# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
nvidia-docker build -t paddle:float16 .
# After running this, different results will be written to different log files in Paddle/contrib/float16/
nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/paddle/contrib/float16/run_float16_demo.sh
```
#### Accuracy
......
......@@ -3,7 +3,7 @@
BUILD_PATH=/paddle/fp16_build
WHEEL_PATH=$BUILD_PATH/python/dist
INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
DEMO_PATH=/paddle/contrib/float16
DEMO_PATH=/paddle/paddle/contrib/float16
# Use the single most powerful CUDA GPU on your machine
export CUDA_VISIBLE_DEVICES=0
......@@ -50,7 +50,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
--repeat=$REPEAT \
......@@ -68,7 +67,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_resnet \
--data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
--repeat=$REPEAT \
......@@ -86,7 +84,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
--repeat=$REPEAT \
......@@ -104,7 +101,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
--repeat=$REPEAT \
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
function(inference_api_test TARGET_NAME TEST_SRC DEP_TEST)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS})
list(APPEND arg_list "_${arg}")
endforeach()
else()
list(APPEND arg_list "_")
endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(${TARGET_NAME}
SRCS ${TEST_SRC}
DEPS paddle_fluid_api paddle_inference_api paddle_inference_api_impl
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
# set_tests_properties(${TARGET_NAME}
# PROPERTIES DEPENDS ${DEP_TEST})
endforeach()
endfunction(inference_api_test)
cc_library(paddle_inference_api
SRCS paddle_inference_api.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_library(paddle_inference_api_impl
SRCS paddle_inference_api_impl.cc
DEPS paddle_inference_api paddle_fluid_api)
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api)
inference_api_test(test_paddle_inference_api_impl
test_paddle_inference_api_impl.cc
test_word2vec)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
......@@ -12,49 +12,74 @@
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the definition of a simple Inference API for Paddle.
*
* ATTENTION: It requires some C++ features, for lower version C++ or C, we
* might release another API.
*/
#pragma once
#include <memory>
#include <string>
#include <vector>
namespace paddle {
class Predictor {
public:
struct Attr;
Predictor() = default;
enum PaddleDType {
FLOAT32,
INT64,
};
// Build the network before inference.
bool Init(const Attr& attr);
struct PaddleBuf {
void* data; // pointer to the data memory.
size_t length; // number of memory bytes.
};
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
/*
* A simple Inference API for Paddle. Currently this API might just be used by
* non-sequence scenerios.
* TODO(Superjomn) Prepare another API for NLP-related usages.
*/
class PaddlePredictor {
public:
struct Config;
PaddlePredictor() = default;
PaddlePredictor(const PaddlePredictor&) = delete;
// Predict an record.
// Arguments:
// inputs: the name of the input variables.
// outputs: the name of the output varaibles.
// input_shapes: the shape of the input variables.
// output_shapes: the shape of the output variables.
// input_data: the data of the input variables.
// output_data: the data of the output variables.
bool Run(const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::vector<std::vector<int>>& input_shapes,
const std::vector<std::vector<int>>& output_shapes,
const std::vector<std::vector<float>>& input_data,
std::vector<std::vector<float>>* output_data);
// Clone a predictor that share the model weights.
Predictor* Clone();
// The caller should be responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be alive until Run returns. caller should be
// responsible for releasing the memory of `output_data`.
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) = 0;
// Clone a predictor that share the model weights, the Cloned predictor should
// be thread-safe.
virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
// Destroy the Predictor.
~Predictor();
virtual ~PaddlePredictor() {}
struct Attr {
friend std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
const PaddlePredictor::Config& config);
// The common configs for all the predictors.
struct Config {
enum class EngineKind;
std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on
// third-party engines.
EngineKind engine_kind{Attr::EngineKind::kNone};
EngineKind engine_kind{Config::EngineKind::kNone};
enum class EngineKind {
kNone = -1, // Use the native Fluid facility.
......@@ -66,4 +91,8 @@ public:
};
};
// A factory to help create difference predictor.
template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sys/time.h>
#include <algorithm>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
namespace paddle {
namespace {
// Timer for timer
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
template <class T>
std::string num2str(T a) {
std::stringstream istr;
istr << a;
return istr.str();
}
} // namespace
bool PaddlePredictorImpl::Init() {
VLOG(3) << "Predictor::init()";
// TODO(panyx0718): Should CPU vs GPU device be decided by id?
if (config_.device >= 0) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
paddle::framework::InitDevices(false);
executor_.reset(new paddle::framework::Executor(place_));
scope_.reset(new paddle::framework::Scope());
// Initialize the inference program
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << "fail to load inference model.";
return false;
}
ctx_ = executor_->Prepare(*inference_program_, 0);
// Create variables
// TODO(panyx0718): Why need to test share_variables here?
if (config_.share_variables) {
executor_->CreateVariables(*inference_program_, scope_.get(), 0);
}
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true;
}
bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
VLOG(3) << "Predictor::predict";
Timer timer;
timer.tic();
// set feed variable
std::map<std::string, const paddle::framework::LoDTensor *> feed_targets;
std::vector<paddle::framework::LoDTensor> feeds;
if (!SetFeed(inputs, &feeds)) {
LOG(ERROR) << "fail to set feed";
return false;
}
for (size_t i = 0; i < feed_target_names_.size(); ++i) {
feed_targets[feed_target_names_[i]] = &feeds[i];
}
// get fetch variable
std::map<std::string, paddle::framework::LoDTensor *> fetch_targets;
std::vector<paddle::framework::LoDTensor> fetchs;
fetchs.resize(fetch_target_names_.size());
for (size_t i = 0; i < fetch_target_names_.size(); ++i) {
fetch_targets[fetch_target_names_[i]] = &fetchs[i];
}
// Run the inference program
// if share variables, we need not create variables
executor_->RunPreparedContext(ctx_.get(),
scope_.get(),
&feed_targets,
&fetch_targets,
!config_.share_variables);
if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs";
return false;
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
return true;
}
std::unique_ptr<PaddlePredictor> PaddlePredictorImpl::Clone() {
VLOG(3) << "Predictor::clone";
std::unique_ptr<PaddlePredictorImpl> cls(new PaddlePredictorImpl(config_));
if (!cls->InitShared(this)) {
LOG(ERROR) << "fail to call InitShared";
return nullptr;
}
return cls;
}
// TODO(panyx0718): Consider merge with Init()?
bool PaddlePredictorImpl::InitShared(PaddlePredictorImpl *cls) {
VLOG(3) << "Predictor::init_shared";
// 1. Define place, executor, scope
if (this->config_.device >= 0) {
place_ = paddle::platform::CUDAPlace();
} else {
place_ = paddle::platform::CPUPlace();
}
this->executor_.reset(new paddle::framework::Executor(this->place_));
this->scope_.reset(new paddle::framework::Scope());
// Initialize the inference program
if (!this->config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
this->inference_program_ = paddle::inference::Load(
this->executor_.get(), this->scope_.get(), this->config_.model_dir);
} else if (!this->config_.prog_file.empty() &&
!this->config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
this->inference_program_ =
paddle::inference::Load(this->executor_.get(),
this->scope_.get(),
this->config_.prog_file,
this->config_.param_file);
}
this->ctx_ = this->executor_->Prepare(*this->inference_program_, 0);
// 3. create variables
// TODO(panyx0718): why test share_variables.
if (config_.share_variables) {
this->executor_->CreateVariables(
*this->inference_program_, this->scope_.get(), 0);
}
// 4. Get the feed_target_names and fetch_target_names
this->feed_target_names_ = this->inference_program_->GetFeedTargetNames();
this->fetch_target_names_ = this->inference_program_->GetFetchTargetNames();
return true;
}
bool PaddlePredictorImpl::SetFeed(
const std::vector<PaddleTensor> &inputs,
std::vector<paddle::framework::LoDTensor> *feeds) {
VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feed_target_names_.size()) {
LOG(ERROR) << "wrong feed input size.";
return false;
}
for (size_t i = 0; i < feed_target_names_.size(); ++i) {
paddle::framework::LoDTensor input;
paddle::framework::DDim ddim =
paddle::framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr =
input.mutable_data<int64_t>(ddim, paddle::platform::CPUPlace());
} else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, paddle::platform::CPUPlace());
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr),
inputs[i].data.data,
inputs[i].data.length);
feeds->push_back(input);
LOG(ERROR) << "Actual feed type " << feeds->back().type().name();
}
return true;
}
bool PaddlePredictorImpl::GetFetch(
const std::vector<paddle::framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *outputs) {
VLOG(3) << "Predictor::get_fetch";
outputs->resize(fetchs.size());
for (size_t i = 0; i < fetchs.size(); ++i) {
// TODO(panyx0718): Support fetch of other types.
if (fetchs[i].type() != typeid(float)) {
LOG(ERROR) << "only support fetching float now.";
return false;
}
std::vector<int> shape;
auto dims_i = fetchs[i].dims();
auto lod = fetchs[i].lod();
const float *output_ptr = fetchs[i].data<float>();
// const int64_t* output_ptr = fetchs[i].data<int64_t>();
auto num = fetchs[i].numel();
std::vector<float> data;
if (0 == lod.size()) {
std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
for (int j = 0; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
} else {
// for batch detection
// image[0] -> output[0] shape {145, 6}
// image[1] -> output[1] shape {176, 6}
// then,
// the batch output shape {321, 6}
// the lod {{0, 145, 321}}
// so we should append output[0] to {176, 6}
size_t max_dim = 0;
for (size_t j = 1; j < lod[0].size(); j++) {
max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
}
size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
if (max_dim > 0) {
data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
}
for (size_t j = 1; j < lod[0].size(); j++) {
size_t start = lod[0][j - 1] * common_dim;
size_t end = lod[0][j] * common_dim;
if (end > start) {
std::copy(output_ptr + start,
output_ptr + end,
data.begin() + (j - 1) * max_dim * common_dim);
}
}
shape.push_back(lod[0].size() - 1);
shape.push_back(max_dim);
for (int j = 1; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
}
outputs->at(i).shape = shape;
outputs->at(i).data.length = sizeof(float) * data.size();
outputs->at(i).data.data = malloc(outputs->at(i).data.length);
std::memcpy(
outputs->at(i).data.data, data.data(), outputs->at(i).data.length);
outputs->at(i).dtype = PaddleDType::FLOAT32;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy.
}
return true;
}
std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl(
const VisConfig &config) {
VLOG(3) << "create PaddlePredictorImpl";
// 1. GPU memeroy
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
num2str<float>(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
}
std::unique_ptr<PaddlePredictorImpl> predictor(
new PaddlePredictorImpl(config));
if (!predictor->Init()) {
return nullptr;
}
return predictor;
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <memory>
#include <string>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
struct VisConfig : public PaddlePredictor::Config {
int device;
float fraction_of_gpu_memory;
std::string prog_file;
std::string param_file;
bool share_variables;
};
/*
* Do not use this, just a demo indicating how to customize a Predictor.
*/
class PaddlePredictorImpl : public PaddlePredictor {
public:
explicit PaddlePredictorImpl(const VisConfig &config) : config_(config) {}
bool Init();
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override;
std::unique_ptr<PaddlePredictor> Clone() override;
~PaddlePredictorImpl() override{};
private:
bool InitShared(PaddlePredictorImpl *cls);
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
std::vector<paddle::framework::LoDTensor> *feeds);
bool GetFetch(const std::vector<paddle::framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *output_data);
VisConfig config_;
paddle::platform::Place place_;
std::unique_ptr<paddle::framework::Executor> executor_;
std::unique_ptr<paddle::framework::Scope> scope_;
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx_;
std::unique_ptr<paddle::framework::ProgramDesc> inference_program_;
std::vector<std::string> feed_target_names_;
std::vector<std::string> fetch_target_names_;
};
std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl(
const VisConfig &config);
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
namespace paddle {
/*
* Do not use this, just a demo indicating how to customize a config for a
* specific predictor.
*/
struct DemoConfig : public PaddlePredictor::Config {
float other_config;
};
/*
* Do not use this, just a demo indicating how to customize a Predictor.
*/
class DemoPredictor : public PaddlePredictor {
public:
explicit DemoPredictor(const DemoConfig &config) {
LOG(INFO) << "I get other_config " << config.other_config;
}
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override {
LOG(INFO) << "Run";
return false;
}
std::unique_ptr<PaddlePredictor> Clone() override { return nullptr; }
~DemoPredictor() override {}
};
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<DemoConfig>(
const DemoConfig &config) {
std::unique_ptr<PaddlePredictor> x(new DemoPredictor(config));
return x;
}
TEST(paddle_inference_api, demo) {
DemoConfig config;
config.other_config = 1.7;
auto predictor = CreatePaddlePredictor(config);
std::vector<PaddleTensor> outputs;
predictor->Run({}, &outputs);
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
namespace paddle {
PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
PaddleTensor pt;
pt.data.data = t->data<void>();
if (t->type() == typeid(int64_t)) {
pt.data.length = t->numel() * sizeof(int64_t);
pt.dtype = PaddleDType::INT64;
} else if (t->type() == typeid(float)) {
pt.data.length = t->numel() * sizeof(float);
pt.dtype = PaddleDType::FLOAT32;
} else {
LOG(FATAL) << "unsupported type.";
}
pt.shape = framework::vectorize2int(t->dims());
return pt;
}
TEST(paddle_inference_api_impl, word2vec) {
VisConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.85;
config.device = 0;
config.share_variables = true;
std::unique_ptr<PaddlePredictorImpl> predictor =
CreatePaddlePredictorImpl(config);
framework::LoDTensor first_word, second_word, third_word, fourth_word;
framework::LoD lod{{0, 1}};
int64_t dict_size = 2073; // The size of dictionary
SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
std::vector<PaddleTensor> cpu_feeds;
cpu_feeds.push_back(LodTensorToPaddleTensor(&first_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&second_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&third_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&fourth_word));
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(cpu_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1);
for (size_t i = 0; i < outputs.size(); ++i) {
size_t len = outputs[i].data.length;
float* data = static_cast<float*>(outputs[i].data.data);
for (int j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0);
}
free(outputs[i].data.data);
}
}
} // namespace paddle
......@@ -87,8 +87,3 @@ else()
endif()
add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies})
add_style_check_target(paddle_cuda
${CUDA_SOURCES}
${CUDA_HEADERS}
${CUDA_CXX_SOURCES})
......@@ -36,5 +36,5 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context gather_op_handle)
cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context reduce_op_handle )
#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
# device_context reduce_op_handle )
......@@ -18,6 +18,7 @@
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/details/send_op_handle.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
......@@ -159,26 +160,40 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
if (!is_forwarding && places_.size() > 1) {
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
for (auto &og : op->OutputArgumentNames()) {
if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
if (static_cast<bool>(boost::get<int>(op->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward))) {
try {
auto backward_vars =
boost::get<std::vector<std::string>>(op->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
for (size_t i = 0; i < backward_vars.size(); i += 2) {
auto &p_name = backward_vars[i];
auto &g_name = backward_vars[i + 1];
VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
switch (strategy_.reduce_) {
case BuildStrategy::ReduceStrategy::kReduce:
CreateReduceOp(&result, og, cur_device_id);
var_name_on_devices[cur_device_id].emplace(og);
bcast_var_name_set[cur_device_id].emplace(
og.substr(0, og.size() - strlen(kGradVarSuffix)));
CreateReduceOp(&result, g_name, cur_device_id);
var_name_on_devices[cur_device_id].emplace(g_name);
bcast_var_name_set[cur_device_id].emplace(p_name);
cur_device_id = (cur_device_id + 1) % places_.size();
break;
case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(var_types, og)) {
CreateReduceOp(&result, og, 0);
CreateBroadcastOp(&result, og, 0);
if (IsSparseGradient(var_types, g_name)) {
CreateReduceOp(&result, g_name, 0);
CreateBroadcastOp(&result, g_name, 0);
} else {
InsertNCCLAllReduceOp(&result, og);
InsertNCCLAllReduceOp(&result, g_name);
}
break;
}
}
} catch (boost::bad_get e) {
}
}
}
}
......@@ -398,11 +413,12 @@ void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
// FIXME(yy): Do not hard code like this
return op.OutputArgumentNames().size() == 1 &&
op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
return boost::get<int>(
op.GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
(static_cast<int>(OpRole::kBackward) |
static_cast<int>(OpRole::kLoss)) &&
!loss_var_name_.empty(); // If loss_var is empty. This is test mode
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -96,10 +96,7 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
info->proto_ = new proto::OpProto;
info->checker_ = new OpAttrChecker();
T maker;
maker.SetProto(info->proto_);
maker.SetChecker(info->checker_);
maker.Make();
maker.Validate();
maker(info->proto_, info->checker_);
info->proto_->set_type(op_type);
PADDLE_ENFORCE(
info->proto_->IsInitialized(),
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <unordered_map>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/shape_inference.h"
......@@ -222,6 +223,15 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
return it->second;
}
Attribute OpDesc::GetNullableAttr(const std::string &name) const {
auto it = attrs_.find(name);
if (it != attrs_.end()) {
return it->second;
} else {
return Attribute();
}
}
int OpDesc::GetBlockAttr(const std::string &name) const {
auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
......@@ -233,13 +243,8 @@ const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
}
void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
for (auto &output : outputs_) {
std::replace(output.second.begin(), output.second.end(), old_name,
new_name);
}
RenameInput(old_name, new_name);
RenameOutput(old_name, new_name);
need_update_ = true;
}
......@@ -249,6 +254,13 @@ void OpDesc::RenameOutput(const std::string &old_name,
std::replace(output.second.begin(), output.second.end(), old_name,
new_name);
}
auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
if (it != attrs_.end()) {
auto &op_vars = boost::get<std::vector<std::string>>(it->second);
std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
}
need_update_ = true;
}
......@@ -257,6 +269,13 @@ void OpDesc::RenameInput(const std::string &old_name,
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
if (it != attrs_.end()) {
auto &op_vars = boost::get<std::vector<std::string>>(it->second);
std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
}
need_update_ = true;
}
......
......@@ -78,6 +78,8 @@ class OpDesc {
Attribute GetAttr(const std::string &name) const;
Attribute GetNullableAttr(const std::string &name) const;
int GetBlockAttr(const std::string &name) const;
void Rename(const std::string &old_name, const std::string &new_name);
......
......@@ -13,6 +13,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_proto_maker.h"
#include <string>
#include <vector>
namespace paddle {
namespace framework {
......@@ -55,5 +56,28 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
}
}
void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
OpAttrChecker* attr_checker) {
proto_ = proto;
op_checker_ = attr_checker;
Make();
AddAttr<int>(OpRoleAttrName(), "The role of this operator")
.InEnum(
{static_cast<int>(OpRole::kForward),
static_cast<int>(OpRole::kBackward),
static_cast<int>(OpRole::kOptimize),
static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
static_cast<int>(OpRole::kLoss) |
static_cast<int>(OpRole::kBackward),
static_cast<int>(OpRole::kNotSpecified)})
.SetDefault(static_cast<int>(OpRole::kNotSpecified));
AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),
"Optimized for variable")
.SetDefault({});
Validate();
}
} // namespace framework
} // namespace paddle
......@@ -20,21 +20,31 @@ limitations under the License. */
namespace paddle {
namespace framework {
enum class OpRole {
kForward = 0x0000,
kBackward = 0x0001,
kOptimize = 0x0002,
kLoss = 0x0100,
// The default value of op's role. This should be only used for unittests and
// CreateOp inside a operator.
kNotSpecified = 0x1000,
};
// this class not only make proto but also init attribute checkers.
class OpProtoAndCheckerMaker {
public:
static const char *OpRoleAttrName() { return "op_role"; }
static const char *OpRoleVarAttrName() { return "op_role_var"; }
void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
virtual void Make() = 0;
virtual ~OpProtoAndCheckerMaker() {
CHECK(validated_) << "should call Validate after build";
}
void SetProto(proto::OpProto *proto) { proto_ = proto; }
void SetChecker(OpAttrChecker *attr_checker) { op_checker_ = attr_checker; }
void Validate();
protected:
struct VariableBuilder {
proto::OpProto::Var *var_;
......@@ -76,6 +86,7 @@ class OpProtoAndCheckerMaker {
private:
void CheckNoDuplicatedInOutAttrs();
void Validate();
proto::OpProto *proto_;
OpAttrChecker *op_checker_;
......
......@@ -28,10 +28,8 @@ TEST(ProtoMaker, DuplicatedAttr) {
paddle::framework::proto::OpProto op_proto;
paddle::framework::OpAttrChecker op_checker;
TestAttrProtoMaker proto_maker;
proto_maker.SetProto(&op_proto);
proto_maker.SetChecker(&op_checker);
proto_maker.Make();
ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
ASSERT_THROW(proto_maker(&op_proto, &op_checker),
paddle::platform::EnforceNotMet);
}
class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
......@@ -46,8 +44,6 @@ TEST(ProtoMaker, DuplicatedInOut) {
paddle::framework::proto::OpProto op_proto;
paddle::framework::OpAttrChecker op_checker;
TestAttrProtoMaker proto_maker;
proto_maker.SetProto(&op_proto);
proto_maker.SetChecker(&op_checker);
proto_maker.Make();
ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
ASSERT_THROW(proto_maker(&op_proto, &op_checker),
paddle::platform::EnforceNotMet);
}
......@@ -63,6 +63,7 @@ class InferShapeContext {
std::vector<InferShapeVarPtr> GetInputVarPtrs(const std::string &name);
std::vector<InferShapeVarPtr> GetOutputVarPtrs(const std::string &name);
virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
// Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names,
......@@ -81,8 +82,6 @@ class InferShapeContext {
const std::vector<std::string> &names) const;
virtual proto::VarType::Type GetVarType(const std::string &name) const = 0;
virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
};
} // namespace framework
......
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api
SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include <vector>
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/analysis/graph_traits.h"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/analysis/node.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <gflags/gflags.h>
......@@ -29,11 +29,10 @@ DEFINE_string(inference_model_dir, "", "inference test model dir");
static framework::proto::ProgramDesc LoadProgramDesc(
const std::string& model_dir = FLAGS_inference_model_dir) {
// TODO(Superjomn) update latter.
auto place = paddle::platform::CPUPlace();
auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope();
auto program = Load(&executor, scope, model_dir);
paddle::platform::CPUPlace place;
paddle::framework::Executor executor(place);
paddle::framework::Scope scope;
auto program = Load(&executor, &scope, model_dir);
return *program->Proto();
}
......
nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
SERIAL)
# This test is not stable
# See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828
#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
# DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
# SERIAL)
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
......@@ -201,9 +201,9 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
listen_and_serv_op sum_op executor SERIAL)
#set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
#cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
# listen_and_serv_op sum_op executor SERIAL)
if(WITH_GPU)
set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <limits>
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
......@@ -196,9 +197,14 @@ bool RPCClient::Wait() {
const size_t kReqCnt = req_count_;
bool a[kReqCnt];
std::vector<std::future<void>> waits(req_count_);
std::mutex mu;
for (int i = 0; i < req_count_; i++) {
waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); });
waits[i] = framework::AsyncIO([i, &a, &mu, this] {
bool ret = Proceed();
std::lock_guard<std::mutex> l(mu);
a[i] = ret;
});
}
for (int i = 0; i < req_count_; i++) {
......
......@@ -19,10 +19,16 @@ limitations under the License. */
using ::grpc::ServerAsyncResponseWriter;
DEFINE_int32(rpc_server_handle_send_threads, 20,
"Number of threads used to handle send at rpc server.");
DEFINE_int32(rpc_server_handle_get_threads, 20,
"Number of threads used to handle get at rpc server.");
DEFINE_int32(rpc_server_handle_prefetch_threads, 1,
"Number of threads used to handle prefetch at rpc server.");
namespace paddle {
namespace operators {
namespace detail {
enum CallStatus { PROCESS = 0, FINISH };
// reference:
......@@ -63,18 +69,20 @@ class RequestSend final : public RequestBase {
explicit RequestSend(GrpcService::AsyncService* service,
::grpc::ServerCompletionQueue* cq, bool sync_mode,
framework::Scope* scope, ReceivedQueue* queue,
const platform::DeviceContext* dev_ctx)
const platform::DeviceContext* dev_ctx, int req_id)
: RequestBase(service, cq, sync_mode, dev_ctx),
queue_(queue),
responder_(&ctx_) {
responder_(&ctx_),
req_id_(req_id) {
if (sync_mode_) {
request_.reset(new VariableResponse(scope, dev_ctx_, false));
} else {
request_.reset(new VariableResponse(scope, dev_ctx_, true));
}
int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
cq_, cq_, this);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
}
virtual ~RequestSend() {}
......@@ -86,15 +94,17 @@ class RequestSend final : public RequestBase {
VLOG(3) << "RequestSend " << var_name;
queue_->Push(std::make_pair(var_name, request_));
sendrecv::VoidMessage reply;
responder_.Finish(reply, ::grpc::Status::OK, this);
status_ = FINISH;
responder_.Finish(reply_, ::grpc::Status::OK,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
}
protected:
sendrecv::VoidMessage reply_;
std::shared_ptr<VariableResponse> request_;
ReceivedQueue* queue_;
ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
int req_id_;
};
class RequestGet final : public RequestBase {
......@@ -103,14 +113,17 @@ class RequestGet final : public RequestBase {
::grpc::ServerCompletionQueue* cq, bool sync_mode,
framework::Scope* scope,
const platform::DeviceContext* dev_ctx,
framework::BlockingQueue<MessageWithName>* queue)
framework::BlockingQueue<MessageWithName>* queue,
int req_id)
: RequestBase(service, cq, sync_mode, dev_ctx),
responder_(&ctx_),
scope_(scope),
queue_(queue) {
queue_(queue),
req_id_(req_id) {
auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
cq_, this);
service_->RequestAsyncUnary(
method_id, &ctx_, &request_, &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
}
virtual ~RequestGet() {}
......@@ -123,13 +136,13 @@ class RequestGet final : public RequestBase {
VLOG(3) << "RequestGet " << var_name;
auto* var = scope_->FindVar(var_name);
::grpc::ByteBuffer reply;
if (var_name != FETCH_BARRIER_MESSAGE) {
SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply_);
}
responder_.Finish(reply, ::grpc::Status::OK, this);
status_ = FINISH;
responder_.Finish(reply_, ::grpc::Status::OK,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
if (var_name == FETCH_BARRIER_MESSAGE) {
sendrecv::VariableMessage msg;
......@@ -140,9 +153,11 @@ class RequestGet final : public RequestBase {
protected:
sendrecv::VariableMessage request_;
::grpc::ByteBuffer reply_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
framework::Scope* scope_;
framework::BlockingQueue<MessageWithName>* queue_;
int req_id_;
};
class RequestPrefetch final : public RequestBase {
......@@ -153,21 +168,24 @@ class RequestPrefetch final : public RequestBase {
const platform::DeviceContext* dev_ctx,
framework::Executor* executor,
framework::ProgramDesc* program,
framework::ExecutorPrepareContext* prefetch_ctx)
framework::ExecutorPrepareContext* prefetch_ctx,
int req_id)
: RequestBase(service, cq, sync_mode, dev_ctx),
responder_(&ctx_),
scope_(scope),
executor_(executor),
program_(program),
prefetch_ctx_(prefetch_ctx) {
prefetch_ctx_(prefetch_ctx),
req_id_(req_id) {
if (sync_mode_) {
request_.reset(new VariableResponse(scope, dev_ctx_, false));
} else {
request_.reset(new VariableResponse(scope, dev_ctx_, true));
}
int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
cq_, cq_, this);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
}
virtual ~RequestPrefetch() {}
......@@ -176,7 +194,6 @@ class RequestPrefetch final : public RequestBase {
virtual void Process() {
// prefetch process...
::grpc::ByteBuffer reply;
std::string var_name = request_->OutVarname();
VLOG(3) << "RequestPrefetch " << var_name;
......@@ -186,19 +203,22 @@ class RequestPrefetch final : public RequestBase {
InitializeVariable(var, var_desc->GetType());
executor_->RunPreparedContext(prefetch_ctx_, scope_);
SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply_);
responder_.Finish(reply, ::grpc::Status::OK, this);
status_ = FINISH;
responder_.Finish(reply_, ::grpc::Status::OK,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
}
protected:
std::shared_ptr<VariableResponse> request_;
::grpc::ByteBuffer reply_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
framework::Scope* scope_;
framework::Executor* executor_;
framework::ProgramDesc* program_;
framework::ExecutorPrepareContext* prefetch_ctx_;
int req_id_;
};
void AsyncGRPCServer::WaitClientGet(int count) {
......@@ -232,24 +252,39 @@ void AsyncGRPCServer::RunSyncUpdate() {
LOG(INFO) << "Server listening on " << address_
<< " selected port: " << selected_port_;
std::function<void()> send_register =
std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
std::function<void()> get_register =
std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
std::function<void()> prefetch_register =
std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
std::function<void(int)> send_register = std::bind(
&AsyncGRPCServer::TryToRegisterNewSendOne, this, std::placeholders::_1);
std::function<void(int)> get_register = std::bind(
&AsyncGRPCServer::TryToRegisterNewGetOne, this, std::placeholders::_1);
std::function<void(int)> prefetch_register =
std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this,
std::placeholders::_1);
// TODO(wuyi): Run these "HandleRequest" in thread pool
t_send_.reset(
for (int i = 0; i < kSendReqsBufSize; ++i) {
TryToRegisterNewSendOne(i);
}
for (int i = 0; i < kGetReqsBufSize; ++i) {
TryToRegisterNewGetOne(i);
}
for (int i = 0; i < kPrefetchReqsBufSize; ++i) {
TryToRegisterNewPrefetchOne(i);
}
for (int i = 0; i < FLAGS_rpc_server_handle_send_threads; ++i) {
t_sends_.emplace_back(
new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
cq_send_.get(), "cq_send", send_register)));
t_get_.reset(
}
for (int i = 0; i < FLAGS_rpc_server_handle_get_threads; ++i) {
t_gets_.emplace_back(
new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
cq_get_.get(), "cq_get", get_register)));
t_prefetch_.reset(new std::thread(
}
for (int i = 0; i < FLAGS_rpc_server_handle_prefetch_threads; ++i) {
t_prefetchs_.emplace_back(new std::thread(
std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
"cq_prefetch", prefetch_register)));
}
{
std::lock_guard<std::mutex> lock(this->mutex_ready_);
ready_ = 1;
......@@ -257,9 +292,15 @@ void AsyncGRPCServer::RunSyncUpdate() {
condition_ready_.notify_all();
// wait server
server_->Wait();
t_send_->join();
t_get_->join();
t_prefetch_->join();
for (int i = 0; i < FLAGS_rpc_server_handle_send_threads; ++i) {
t_sends_[i]->join();
}
for (int i = 0; i < FLAGS_rpc_server_handle_get_threads; ++i) {
t_gets_[i]->join();
}
for (int i = 0; i < FLAGS_rpc_server_handle_prefetch_threads; ++i) {
t_prefetchs_[i]->join();
}
}
void AsyncGRPCServer::ShutdownQueue() {
......@@ -276,47 +317,48 @@ void AsyncGRPCServer::ShutDown() {
server_->Shutdown();
}
void AsyncGRPCServer::TryToRegisterNewSendOne() {
void AsyncGRPCServer::TryToRegisterNewSendOne(int i) {
std::unique_lock<std::mutex> lock(cq_mutex_);
if (is_shut_down_) {
VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
return;
}
RequestSend* send = new RequestSend(&service_, cq_send_.get(), sync_mode_,
scope_, &var_recv_queue_, dev_ctx_);
scope_, &var_recv_queue_, dev_ctx_, i);
send_reqs_[i] = static_cast<RequestBase*>(send);
VLOG(4) << "Create RequestSend status:" << send->Status();
}
void AsyncGRPCServer::TryToRegisterNewGetOne() {
void AsyncGRPCServer::TryToRegisterNewGetOne(int req_id) {
std::unique_lock<std::mutex> lock(cq_mutex_);
if (is_shut_down_) {
VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
return;
}
RequestGet* get = new RequestGet(&service_, cq_get_.get(), sync_mode_, scope_,
dev_ctx_, &var_get_queue_);
dev_ctx_, &var_get_queue_, req_id);
get_reqs_[req_id] = static_cast<RequestBase*>(get);
VLOG(4) << "Create RequestGet status:" << get->Status();
}
void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
void AsyncGRPCServer::TryToRegisterNewPrefetchOne(int req_id) {
std::unique_lock<std::mutex> lock(cq_mutex_);
if (is_shut_down_) {
VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
return;
}
RequestPrefetch* prefetch =
new RequestPrefetch(&service_, cq_prefetch_.get(), sync_mode_, scope_,
dev_ctx_, executor_, program_, prefetch_ctx_.get());
RequestPrefetch* prefetch = new RequestPrefetch(
&service_, cq_prefetch_.get(), sync_mode_, scope_, dev_ctx_, executor_,
program_, prefetch_ctx_.get(), req_id);
prefetch_reqs_[req_id] = static_cast<RequestBase*>(prefetch);
VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
}
// FIXME(typhoonzero): change cq_name to enum.
void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
const std::string& cq_name,
std::function<void()> TryToRegisterNewOne) {
TryToRegisterNewOne();
void AsyncGRPCServer::HandleRequest(
::grpc::ServerCompletionQueue* cq, const std::string& cq_name,
std::function<void(int)> TryToRegisterNewOne) {
void* tag = NULL;
bool ok = false;
......@@ -327,8 +369,7 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
break;
}
VLOG(3) << "HandleRequest for " << cq_name << " get Next";
PADDLE_ENFORCE(tag);
int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
if (sync_mode_) {
// FIXME(typhoonzero): de-couple the barriers with recv_op
......@@ -337,7 +378,17 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond";
}
RequestBase* base = reinterpret_cast<RequestBase*>(tag);
RequestBase* base = nullptr;
{
std::lock_guard<std::mutex> l(cq_mutex_);
if (cq_name == "cq_get") {
base = get_reqs_[req_id];
} else if (cq_name == "cq_send") {
base = send_reqs_[req_id];
} else if (cq_name == "cq_prefetch") {
base = prefetch_reqs_[req_id];
}
}
// reference:
// https://github.com/tensorflow/tensorflow/issues/5596
// https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
......@@ -345,19 +396,19 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
if (!ok) {
LOG(WARNING) << cq_name << " recv no regular event:argument name["
<< base->GetReqName() << "]";
TryToRegisterNewOne();
TryToRegisterNewOne(req_id);
delete base;
continue;
}
switch (base->Status()) {
case PROCESS: {
TryToRegisterNewOne();
base->Process();
VLOG(4) << cq_name << " PROCESS status:" << base->Status();
break;
}
case FINISH: {
TryToRegisterNewOne(req_id);
VLOG(4) << cq_name << " FINISH status:" << base->Status();
delete base;
break;
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include <thread> // NOLINT
#include <utility>
#include <vector>
#include "grpc++/grpc++.h"
#include "paddle/fluid/framework/blocking_queue.h"
......@@ -30,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
......@@ -82,19 +84,27 @@ class AsyncGRPCServer final {
protected:
void HandleRequest(::grpc::ServerCompletionQueue *cq,
const std::string &cq_name,
std::function<void()> TryToRegisterNewOne);
void TryToRegisterNewSendOne();
void TryToRegisterNewGetOne();
void TryToRegisterNewPrefetchOne();
std::function<void(int)> TryToRegisterNewOne);
void TryToRegisterNewSendOne(int req_id);
void TryToRegisterNewGetOne(int req_id);
void TryToRegisterNewPrefetchOne(int req_id);
void ShutdownQueue();
private:
static const int kSendReqsBufSize = 100;
static const int kGetReqsBufSize = 100;
static const int kPrefetchReqsBufSize = 10;
std::mutex cq_mutex_;
volatile bool is_shut_down_ = false;
std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_;
std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_;
std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_;
RequestBase *send_reqs_[kSendReqsBufSize];
RequestBase *get_reqs_[kGetReqsBufSize];
RequestBase *prefetch_reqs_[kPrefetchReqsBufSize];
GrpcService::AsyncService service_;
std::unique_ptr<::grpc::Server> server_;
......@@ -113,8 +123,10 @@ class AsyncGRPCServer final {
mutable int barrier_cond_step_;
std::condition_variable barrier_condition_;
std::unique_ptr<std::thread> t_send_;
std::unique_ptr<std::thread> t_get_;
std::vector<std::unique_ptr<std::thread>> t_sends_;
std::vector<std::unique_ptr<std::thread>> t_gets_;
std::vector<std::unique_ptr<std::thread>> t_prefetchs_;
std::unique_ptr<std::thread> t_prefetch_;
std::unique_ptr<framework::ExecutorPrepareContext> prefetch_ctx_;
......
......@@ -108,7 +108,7 @@ void StartServer(const std::string& endpoint) {
rpc_service_->RunSyncUpdate();
}
TEST(PREFETCH, CPU) {
TEST(PREFETCH, DISABLED_CPU) {
// start up a server instance backend
std::thread server_thread(StartServer, "127.0.0.1:8889");
sleep(2);
......
......@@ -25,6 +25,8 @@
#include <grpc++/support/byte_buffer.h>
#include "paddle/fluid/operators/detail/variable_response.h"
#include "paddle/fluid/platform/profiler.h"
// NOTE: This method was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// method and did some modifications so that we can parse gRPC
......
......@@ -70,10 +70,10 @@ message VariableMessage {
bytes rows = 9;
// Look up table block execution output variable name.
string out_varname = 10;
// If true, the ps server will start profiling, the ps
// If 1, the ps server will start profiling, the ps
// server stops profiling and generates a profile to /tmp/profile_ps_*
// when profile switches from true to false.
bool profile = 11;
// when profile switches from 1 to 2.
int64 profile = 11;
}
message VoidMessage {}
......@@ -123,7 +123,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
// 1 trainer returns true for ShouldSendProfileState(). It tells PS
// servers the trainer's profiling state so that PS can follow the
// trainer.
request.set_profile(platform::IsProfileEnabled());
if (platform::ShouldSendProfileState()) {
if (platform::IsProfileEnabled()) {
request.set_profile(platform::kEnableProfiler);
} else {
request.set_profile(platform::kDisableProfiler);
}
}
if (!out_name.empty()) {
request.set_out_varname(out_name);
}
......@@ -143,12 +149,14 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
}
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
// GPU data is copied to CPU buffer when sending,
// free the buffer when possible.
destroy_callback = [](void* backing) {
platform::CUDAPinnedPlace cuda_pinned;
memory::Free(cuda_pinned, backing);
};
#endif
}
std::string header;
......
......@@ -449,8 +449,8 @@ int VariableResponse::Parse(Source* source) {
break;
}
case sendrecv::VariableMessage::kProfileFieldNumber: {
bool profiling;
if (!input.ReadRaw(reinterpret_cast<void*>(&profiling), 1)) {
uint64_t profiling = 0;
if (!input.ReadVarint64(&profiling)) {
return tag;
}
meta_.set_profile(profiling);
......@@ -458,9 +458,11 @@ int VariableResponse::Parse(Source* source) {
if (listener_id <= 0) {
break;
}
if (profiling && !platform::IsProfileEnabled()) {
if (profiling == platform::kEnableProfiler &&
!platform::IsProfileEnabled()) {
platform::EnableProfiler(platform::ProfilerState::kCPU);
} else if (!profiling && platform::IsProfileEnabled()) {
} else if (profiling == platform::kDisableProfiler &&
platform::IsProfileEnabled()) {
// TODO(panyx0718): Should we allow to customize file dir.
platform::DisableProfiler(
platform::EventSortingKey::kDefault,
......
......@@ -24,6 +24,8 @@ detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
detection_library(target_assign_op SRCS target_assign_op.cc
target_assign_op.cu)
detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu)
# Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto* in = ctx.Input<Tensor>("Input");
auto in_dims = in->dims();
const T* in_data = in->data<T>();
auto* out = ctx.Output<Tensor>("Output");
T* out_data = out->mutable_data<T>(ctx.GetPlace());
int batch_size = in_dims[0];
int geo_channel = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int id = 0;
for (int id_n = 0; id_n < batch_size * geo_channel; ++id_n) {
for (int id_h = 0; id_h < height; ++id_h) {
for (int id_w = 0; id_w < width; ++id_w) {
id = id_n * height * width + width * id_h + id_w;
if (id_n % 2 == 0) {
out_data[id] = id_w - in_data[id];
} else {
out_data[id] = id_h - in_data[id];
}
}
}
}
}
};
class PolygonBoxTransformOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("Input"),
"Input (Input) of polygon_box transform op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Output"),
"Output (Output) of polygon_box transform op should not be null.");
auto in_dim = ctx->GetInputDim("Input");
PADDLE_ENFORCE_EQ(in_dim.size(), 4, "input's rank must be 4.");
PADDLE_ENFORCE_EQ(in_dim[1] % 2, 0,
"input's second dimension must be even.");
ctx->SetOutputDim("Output", in_dim);
}
};
class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"Input",
"The input with shape [batch_size, geometry_channels, height, width]");
AddOutput("Output", "The output with the same shape as input");
AddComment(R"DOC(
PolygonBoxTransform Operator.
The input is the final geometry output in detection network.
We use 2*n numbers to denote the coordinate shift from n corner vertices of
the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi),
the geometry output contains 2*n channels.
PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(polygon_box_transform, ops::PolygonBoxTransformOp,
ops::PolygonBoxTransformOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
polygon_box_transform,
ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, float>,
ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using platform::PADDLE_CUDA_NUM_THREADS;
#define CUDA_BLOCK_SIZE 16
template <typename T>
__global__ void PolygonBoxTransformKernel(const int n, const int h, const int w,
const T* input, T* output) {
int id_n = threadIdx.x + blockDim.x * blockIdx.x;
int id_h = threadIdx.y + blockDim.y * blockIdx.y;
int id_w = threadIdx.z + blockDim.z * blockIdx.z;
if (id_n < n && id_h < h && id_w < w) {
int id = id_n * h * w + w * id_h + id_w;
if (id_n % 2 == 0) {
output[id] = id_w - input[id];
} else {
output[id] = id_h - input[id];
}
}
}
template <typename T>
class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto* in = ctx.Input<Tensor>("Input");
auto in_dims = in->dims();
const T* in_data = in->data<T>();
auto* out = ctx.Output<Tensor>("Output");
T* out_data = out->mutable_data<T>(ctx.GetPlace());
int batch_size = in_dims[0];
int geo_channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
dim3 threadsPerBlock(
PADDLE_CUDA_NUM_THREADS / (CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE),
CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE);
dim3 numBlocks((batch_size * geo_channels) / threadsPerBlock.x,
(height + threadsPerBlock.y - 1) / threadsPerBlock.y,
(width + threadsPerBlock.z - 1) / threadsPerBlock.z);
auto stream = ctx.cuda_device_context().stream();
PolygonBoxTransformKernel<T><<<numBlocks, threadsPerBlock, 0, stream>>>(
batch_size * geo_channels, height, width, in_data, out_data);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
polygon_box_transform,
paddle::operators::PolygonBoxTransformOpCUDAKernel<float>,
paddle::operators::PolygonBoxTransformOpCUDAKernel<double>);
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
......@@ -24,19 +26,57 @@ struct AddFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
};
template <typename DeviceContext, typename T>
void default_elementwise_add(const framework::ExecutionContext& ctx,
const framework::Tensor* x,
const framework::Tensor* y, framework::Tensor* z) {
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
AddFunctor<T>(), z);
}
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_floating_point<T>::value &&
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
auto eigen_x = framework::EigenVector<T>::Flatten(*x);
auto eigen_y = framework::EigenVector<T>::Flatten(*y);
auto eigen_z = framework::EigenVector<T>::Flatten(*z);
auto blas = math::GetBlas<DeviceContext, T>(ctx);
blas.VADD(x->numel(), eigen_x.data(), eigen_y.data(), eigen_z.data());
}
template <typename DeviceContext, typename T>
typename std::enable_if<
!std::is_floating_point<T>::value ||
!std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
}
template <typename DeviceContext, typename T>
class ElementwiseAddKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
const auto x = ctx.Input<Tensor>("X");
const auto y = ctx.Input<Tensor>("Y");
auto z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
AddFunctor<T>(), z);
auto dims_equal = x->dims() == y->dims();
if (dims_equal) {
elementwise_add<DeviceContext, T>(ctx, x, y, z);
} else {
default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
}
}
};
......@@ -45,6 +85,55 @@ struct IdentityGrad {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
};
template <typename DeviceContext, typename T>
void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x,
const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout,
framework::Tensor* dx,
framework::Tensor* dy) {
int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
IdentityGrad<T>());
}
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_floating_point<T>::value &&
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
auto blas = math::GetBlas<DeviceContext, T>(ctx);
if (dx) {
blas.VCOPY(dout->numel(), dout->data<T>(),
dx->mutable_data<T>(ctx.GetPlace()));
}
if (dy) {
blas.VCOPY(dout->numel(), dout->data<T>(),
dy->mutable_data<T>(ctx.GetPlace()));
}
}
template <typename DeviceContext, typename T>
typename std::enable_if<
!std::is_floating_point<T>::value ||
!std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
}
template <typename DeviceContext, typename T>
class ElementwiseAddGradKernel : public framework::OpKernel<T> {
public:
......@@ -57,10 +146,13 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
IdentityGrad<T>());
if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) {
elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
} else {
default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
dy);
}
}
};
......
......@@ -46,9 +46,11 @@ class ElementwiseOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto x_var = op_desc.Input("X")[0];
auto out_var = op_desc.Output("Out")[0];
block->Var(out_var)->SetType(block->Var(x_var)->GetType());
auto x_name = op_desc.Input("X")[0];
auto out_name = op_desc.Output("Out")[0];
auto& x = block->FindRecursiveOrCreateVar(x_name);
auto& out = block->FindRecursiveOrCreateVar(out_name);
out.SetType(x.GetType());
}
};
......
......@@ -24,6 +24,14 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Output<framework::Tensor>("Out");
auto* in = ctx.Input<framework::LoDTensor>("Input");
if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
// set the correct batch size for the LoDTensor.
auto odims = out->dims();
int output_dim_idx = ctx.Attr<int>("output_dim_idx");
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->mutable_data<T>(odims, ctx.GetPlace());
}
out->mutable_data<T>(ctx.GetPlace());
auto value = ctx.Attr<float>("value");
......
......@@ -125,6 +125,12 @@ class Blas {
template <typename T>
void AXPY(int n, T alpha, const T* x, T* y) const;
template <typename T>
void VADD(int n, const T* x, const T* y, T* z) const;
template <typename T>
void VCOPY(int n, const T* x, T* y) const;
template <typename T>
void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta,
T* C) const;
......@@ -163,6 +169,16 @@ class BlasT : private Blas<DeviceContext> {
Base()->template AXPY<T>(args...);
}
template <typename... ARGS>
void VADD(ARGS... args) const {
Base()->template VADD<T>(args...);
}
template <typename... ARGS>
void VCOPY(ARGS... args) const {
Base()->template VCOPY<T>(args...);
}
template <typename... ARGS>
void GEMV(ARGS... args) const {
Base()->template GEMV<T>(args...);
......
......@@ -34,6 +34,18 @@ struct CBlas<float> {
cblas_saxpy(args...);
}
#ifdef PADDLE_WITH_MKLML
template <typename... ARGS>
static void VADD(ARGS... args) {
vsAdd(args...);
}
#endif
template <typename... ARGS>
static void VCOPY(ARGS... args) {
cblas_scopy(args...);
}
template <typename... ARGS>
static void GEMV(ARGS... args) {
cblas_sgemv(args...);
......@@ -59,6 +71,18 @@ struct CBlas<double> {
cblas_daxpy(args...);
}
#ifdef PADDLE_WITH_MKLML
template <typename... ARGS>
static void VADD(ARGS... args) {
vdAdd(args...);
}
#endif
template <typename... ARGS>
static void VCOPY(ARGS... args) {
cblas_dcopy(args...);
}
template <typename... ARGS>
static void GEMV(ARGS... args) {
cblas_dgemv(args...);
......@@ -139,6 +163,24 @@ void Blas<platform::CPUDeviceContext>::AXPY(int n, T alpha, const T *x,
CBlas<T>::AXPY(n, alpha, x, 1, y, 1);
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VCOPY(int n, const T *x, T *y) const {
CBlas<T>::VCOPY(n, x, 1, y, 1);
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
T *z) const {
#ifdef PADDLE_WITH_MKLML
CBlas<T>::VADD(n, x, y, z);
#else
this->template VCOPY<T>(n, y, z);
this->template AXPY<T>(n, 1., x, z);
#endif
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
......
......@@ -23,6 +23,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o
reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
# Export local libraries to parent
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class CustomReader : public framework::DecoratedReader {
public:
CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block,
const platform::Place& dev_place,
const std::vector<std::string>& source_var_names,
const std::vector<std::string>& sink_var_names)
: DecoratedReader(reader),
program_(*sub_block.Program()),
sub_block_id_(sub_block.ID()),
exe_(framework::Executor(dev_place)),
source_var_names_(source_var_names),
sink_var_names_(sink_var_names) {}
void ReadNext(std::vector<framework::LoDTensor>* out) override;
private:
const framework::ProgramDesc program_;
int sub_block_id_;
framework::Executor exe_;
std::vector<std::string> source_var_names_;
std::vector<std::string> sink_var_names_;
};
class CreateCustomReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
auto* sub_block = Attr<framework::BlockDesc*>("sub_block");
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
out->Reset(
new CustomReader(underlying_reader.Get(), *sub_block, dev_place,
Attr<std::vector<std::string>>("source_var_names"),
Attr<std::vector<std::string>>("sink_var_names")));
}
};
class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
protected:
void Apply() override {
AddAttr<framework::BlockDesc*>(
"sub_block", "The block to hold all preprocessing operators.");
AddAttr<std::vector<std::string>>(
"source_var_names",
"Source variables are starting points of data preprocessing. They hold "
"preprocessing's input tensors. Each source variable corresponds to "
"one of underlying reader's output datas.");
AddAttr<std::vector<std::string>>(
"sink_var_names",
"Sink variables are ending points of data preprocessing. They hold "
"preprocessing's output tensors. Each sink variable corresponds to "
"one of custom reader's output datas.");
AddComment(R"DOC(
CreateCustomReader Operator
A custom reader can be used for input data preprocessing.
A custom reader holds its own sub-block, which will be executed in its
'ReadNext()' function. Users can configurate their own preprocessing
pipelines by inserting operators into custom reader's sub-block.
)DOC");
}
};
class CustomReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(!ctx->IsRuntime(),
"'CustomReaderInferShape' should only be invoked during "
"compile time.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output decorated reader should not be null.");
const auto* sub_block =
ctx->Attrs().Get<framework::BlockDesc*>("sub_block");
const auto sink_var_names =
ctx->Attrs().Get<std::vector<std::string>>("sink_var_names");
std::vector<std::vector<int64_t>> res_dims;
std::vector<int32_t> res_lod_levels;
for (const std::string& var_name : sink_var_names) {
auto* sink_var = sub_block->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(sink_var);
res_dims.emplace_back(sink_var->GetShape());
res_lod_levels.push_back(sink_var->GetLoDLevel());
}
auto* out_reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetShapes(res_dims);
out_reader->SetLoDLevels(res_lod_levels);
}
};
class CustomReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]);
PADDLE_ENFORCE_NOT_NULL(out_reader);
out_reader->SetType(framework::proto::VarType::READER);
auto sink_var_names =
boost::get<std::vector<std::string>>(op_desc.GetAttr("sink_var_names"));
const auto* sub_block =
boost::get<framework::BlockDesc*>(op_desc.GetAttr("sub_block"));
std::vector<framework::proto::VarType::Type> res_data_types;
for (const std::string& var_name : sink_var_names) {
framework::VarDesc* var = sub_block->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var);
res_data_types.emplace_back(var->GetDataType());
}
out_reader->SetDataTypes(res_data_types);
}
};
void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
out->clear();
std::vector<framework::LoDTensor> underlying_outs;
reader_->ReadNext(&underlying_outs);
if (underlying_outs.empty()) {
// There is not next data.
return;
}
PADDLE_ENFORCE(source_var_names_.size() == underlying_outs.size(),
"The size of source_var_names(%d) and the size of "
"underlying_outs(%d) are not consistent. Each feeding element "
"must have its own source variable.",
source_var_names_.size(), underlying_outs.size());
// The scope for CustomReader's sub-block should be independent and shouldn't
// be any other computation scope's child. Otherwise, data preprocessing and
// compution cannot be concurrent.
framework::Scope scope;
// 1. Copy LoDTensors from underlying reader's output to source variables.
for (size_t i = 0; i < source_var_names_.size(); ++i) {
framework::Variable* var = scope.Var(source_var_names_[i]);
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
tensor->ShareDataWith(underlying_outs[i]);
tensor->set_lod(underlying_outs[i].lod());
}
// 2. Run the sub-block.
exe_.Run(program_, &scope, sub_block_id_, false, true);
// 3. Copy LoDTensors from sink variables to out.
out->resize(sink_var_names_.size());
for (size_t i = 0; i < sink_var_names_.size(); ++i) {
const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i]))
.Get<framework::LoDTensor>();
framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
}
}
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_OPERATOR(create_custom_reader, ops::CreateCustomReaderOp,
ops::CreateCustomReaderOpMaker, ops::CustomReaderInferShape,
ops::CustomReaderInferVarType,
paddle::framework::EmptyGradOpMaker)
......@@ -115,6 +115,7 @@ void DecoratedReaderInferShape::operator()(
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetLoDLevels(in_reader->GetLoDLevels());
}
void DecoratedReaderInferVarType::operator()(
const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
......
......@@ -63,7 +63,7 @@ void StartServer(std::atomic<bool>* initialized) {
server_thread.join();
}
TEST(SendNcclId, Normal) {
TEST(SendNcclId, DISABLED_Normal) {
std::atomic<bool> initialized{false};
std::thread server_thread(StartServer, &initialized);
while (!initialized) {
......
......@@ -245,7 +245,6 @@ class DeviceTracerImpl : public DeviceTracer {
void Enable() {
std::lock_guard<std::mutex> l(trace_mu_);
if (enabled_) {
fprintf(stderr, "DeviceTracer already enabled\n");
return;
}
EnableActivity();
......
......@@ -116,6 +116,8 @@ void ResetProfiler();
void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path);
const int kEnableProfiler = 1;
const int kDisableProfiler = 2;
// Test if the profiler is currently enabled.
bool IsProfileEnabled();
// Whether the trainer should send profiling state to PS.
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/pybind/const_value.h"
#include <paddle/fluid/framework/op_proto_maker.h>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
......@@ -23,6 +24,21 @@ void BindConstValue(pybind11::module* m) {
m->def("kTempVarName", [] { return framework::kTempVarName; });
m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
auto op_proto_and_checker_maker =
m->def_submodule("op_proto_and_checker_maker");
pybind11::enum_<framework::OpRole>(op_proto_and_checker_maker, "OpRole")
.value("Forward", framework::OpRole::kForward)
.value("Backward", framework::OpRole::kBackward)
.value("Optimize", framework::OpRole::kOptimize)
.value("Loss", framework::OpRole::kLoss);
op_proto_and_checker_maker.def(
"kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName);
op_proto_and_checker_maker.def(
"kOpRoleVarAttrName",
framework::OpProtoAndCheckerMaker::OpRoleVarAttrName);
}
} // namespace pybind
......
......@@ -52,9 +52,3 @@ add_simple_unittest(Im2ColTest)
add_simple_unittest(GemmConvOpTest)
add_simple_unittest(DepthwiseConvOpTest)
endif()
add_style_check_target(paddle_function ${h_files})
add_style_check_target(paddle_function ${cpp_files})
if(WITH_GPU)
add_style_check_target(paddle_function ${cu_files})
endif()
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "unsupported/Eigen/CXX11/Tensor"
#include "paddle/function/EigenThreadDevice.h"
namespace paddle {
......@@ -70,25 +70,26 @@ struct EigenBlasGemm {
dims[0].first = transA ? 0 : 1;
dims[0].second = transB ? 1 : 0;
Eigen::DefaultDevice device;
auto* device = EigenDeviceWarpper::device();
if (N == ldc) {
if (alpha == T(1) && beta == T(0)) {
c.device(device) = a.contract(b, dims);
c.device(*device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.device(device) += a.contract(b, dims);
c.device(*device) += a.contract(b, dims);
} else {
c.device(device) = alpha * a.contract(b, dims) + beta * c;
c.device(*device) = alpha * a.contract(b, dims) + beta * c;
}
} else {
if (alpha == T(1) && beta == T(0)) {
c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
c.slice(offsetC, extentC).device(*device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
c.slice(offsetC, extentC).device(*device) += a.contract(b, dims);
} else {
c.slice(offsetC, extentC).device(device) =
c.slice(offsetC, extentC).device(*device) =
alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
}
}
EigenDeviceWarpper::free_device(device);
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#if defined(__OSX__) || defined(__APPLE__)
#include <sys/sysctl.h>
#include <sys/types.h>
#endif
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
#if defined(__ANDROID__)
int GetCpuCount() {
FILE* fp = fopen("/sys/devices/system/cpu/possible", "r");
if (!fp) {
return 1;
}
int rank0, rank1;
int num = fscanf(fp, "%d-%d", &rank0, &rank1);
fclose(fp);
if (num < 2) return 1;
return rank1 + 1;
}
#elif defined(__OSX__) || defined(__APPLE__)
int GetCpuCount() {
int count = 0;
size_t len = sizeof(int);
sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
return count > 0 ? count : 1;
}
#else
int GetCpuCount() { return 1; }
#endif
class EigenDeviceWarpper {
public: // NOLINT
#if EIGEN_USE_THREADS
static Eigen::ThreadPoolDevice* device() {
const int num_cpus = GetCpuCount();
const int num_threads = (num_cpus > 2) ? 2 : num_cpus;
static Eigen::ThreadPool tp(num_threads);
static Eigen::ThreadPoolDevice* device =
new Eigen::ThreadPoolDevice(&tp, num_threads);
return device;
}
static void free_device(Eigen::ThreadPoolDevice* device) {
// do nothing
}
#else
static Eigen::DefaultDevice* device() {
Eigen::DefaultDevice* device = new Eigen::DefaultDevice;
return device;
}
static void free_device(Eigen::DefaultDevice* device) { delete device; }
#endif
};
} // namespace paddle
......@@ -146,8 +146,6 @@ else()
${GSERVER_SOURCES})
endif()
add_style_check_target(paddle_gserver ${GSERVER_SOURCES})
add_style_check_target(paddle_gserver ${GSERVER_HEADER})
add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies})
if(WITH_TESTING)
add_subdirectory(tests)
......
......@@ -51,10 +51,6 @@ else()
endif()
add_style_check_target(paddle_math ${MATH_SOURCES})
add_style_check_target(paddle_math ${MATH_HEADERS})
add_dependencies(paddle_math paddle_proto ${external_project_dependencies}) # depends
if(WITH_TESTING)
add_subdirectory(tests)
......
......@@ -7,6 +7,10 @@ set(OPITMIZER_SRCS
sgd_optimizer.cc
)
cc_library(paddle_optimizer STATIC SRCS ${OPITMIZER_SRCS} DEPS paddle_proto glog)
cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
add_library(paddle_optimizer ${OPITMIZER_SRCS})
target_link_libraries(paddle_optimizer paddle_proto glog)
if (WITH_TESTING)
add_unittest(serialization_test serialization_test.cc)
add_unittest(parameter_optimizer_test parameter_optimizer_test.cc)
endif()
......@@ -5,8 +5,6 @@ file(GLOB PARAMETERS_SOURCES . *.cpp)
add_library(paddle_parameter STATIC
${PARAMETERS_SOURCES})
add_style_check_target(paddle_parameter ${PARAMETERS_SOURCES})
add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS})
add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies})
if(WITH_TESTING)
add_subdirectory(tests)
......
......@@ -14,9 +14,6 @@ set(NETWORK_HEADERS
add_library(paddle_network STATIC
${NETWORK_SOURCES})
add_style_check_target(paddle_network ${NETWORK_SOURCES})
add_style_check_target(paddle_network ${NETWORK_HEADERS})
add_dependencies(paddle_network paddle_proto ${external_project_dependencies})
################### paddle_pserver ######################
......@@ -37,9 +34,6 @@ set(PSERVER_HEADERS
add_library(paddle_pserver STATIC
${PSERVER_SOURCES})
add_style_check_target(paddle_pserver ${PSERVER_SOURCES})
add_style_check_target(paddle_pserver ${PSERVER_HEADERS})
add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})
set(PSERVER_MAIN_SOURCES
......
#!/bin/bash
function cmake_gen() {
mkdir -p /paddle/build
cd /paddle/build
# build script will not fail if *.deb does not exist
rm *.deb 2>/dev/null || true
# delete previous built whl packages
rm -rf /paddle/paddle/dist 2>/dev/null || true
# Support build for all python versions, currently
# including cp27-cp27m and cp27-cp27mu.
PYTHON_FLAGS=""
if [ "$1" != "" ]; then
echo "using python abi: $1"
if [ "$1" == "cp27-cp27m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
elif [ "$1" == "cp27-cp27mu" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
fi
fi
cat <<EOF
========================================
Configuring cmake in /paddle/build ...
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
${PYTHON_FLAGS}
-DWITH_DSO=ON
-DWITH_DOC=${WITH_DOC:-OFF}
-DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-OFF}
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-DWITH_SWIG_PY=ON
-DWITH_C_API=${WITH_C_API:-OFF}
-DWITH_PYTHON=${WITH_PYTHON:-ON}
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-DCUDNN_ROOT=/usr/
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
# docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
${PYTHON_FLAGS} \
-DWITH_DSO=ON \
-DWITH_DOC=${WITH_DOC:-OFF} \
-DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-DWITH_C_API=${WITH_C_API:-OFF} \
-DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
}
function run_build() {
cat <<EOF
============================================
Building in /paddle/build ...
============================================
EOF
make clean
make -j `nproc`
}
function run_test() {
if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
cat <<EOF
========================================
Running unit tests ...
========================================
EOF
ctest --output-on-failure
# make install should also be test when unittest
make install -j `nproc`
pip install /usr/local/opt/paddle/share/wheels/*.whl
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version
fi
fi
}
function gen_docs() {
if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
cat <<EOF
========================================
Building documentation ...
In /paddle/build_doc
========================================
EOF
mkdir -p /paddle/build_doc
pushd /paddle/build_doc
cmake .. \
-DWITH_DOC=ON \
-DWITH_GPU=OFF \
-DWITH_AVX=${WITH_AVX:-ON} \
-DWITH_SWIG_PY=ON \
-DWITH_STYLE_CHECK=OFF
make -j `nproc` paddle_docs paddle_apis
popd
fi
if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
cat <<EOF
========================================
Converting C++ source code into HTML ...
========================================
EOF
export WOBOQ_OUT=/paddle/build/woboq_out
mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data
/woboq/generator/codebrowser_generator \
-b /paddle/build \
-a \
-o $WOBOQ_OUT \
-p paddle:/paddle
/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
fi
}
function gen_dockerfile() {
# Set BASE_IMAGE according to env variables
if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04"
else
BASE_IMAGE="ubuntu:16.04"
fi
DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/x86_64-linux-gnu/libcudnn.so"
fi
cat <<EOF
========================================
Generate /paddle/build/Dockerfile ...
========================================
EOF
cat > /paddle/build/Dockerfile <<EOF
FROM ${BASE_IMAGE}
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ENV HOME /root
EOF
if [[ ${WITH_GPU} == "ON" ]]; then
NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&"
else
NCCL_DEPS=""
fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
PADDLE_VERSION="paddle version"
CMD='"paddle", "version"'
else
PADDLE_VERSION="true"
CMD='"true"'
fi
cat >> /paddle/build/Dockerfile <<EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update &&\
${NCCL_DEPS}\
apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \
pip install /*.whl; apt-get install -f -y && \
apt-get clean -y && \
rm -f /*.whl && \
${PADDLE_VERSION} && \
ldconfig
${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_GPU_ENV}
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
cat >> /paddle/build/Dockerfile <<EOF
ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/
EOF
fi
cat >> /paddle/build/Dockerfile <<EOF
# default command shows the paddle version and exit
CMD [${CMD}]
EOF
}
function gen_capi_package() {
if [[ ${WITH_C_API} == "ON" ]]; then
install_prefix="/paddle/build/capi_output"
rm -rf $install_prefix
make DESTDIR="$install_prefix" install
cd $install_prefix/usr/local
ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz
fi
}
function gen_fluid_inference_lib() {
if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
cat <<EOF
========================================
Deploying fluid inference library ...
========================================
EOF
make -j `nproc` inference_lib_dist
fi
}
set -xe
cmake_gen ${PYTHON_ABI:-""}
run_build
run_test
gen_docs
gen_dockerfile
gen_capi_package
gen_fluid_inference_lib
if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n"
else
printf "If you need to install PaddlePaddle in develop docker image,"
printf "please make install or pip install build/python/dist/*.whl.\n"
fi
#!/bin/bash
set -xe
if [ $ANDROID_ABI == "arm64-v8a" ]; then
ANDROID_ARCH=arm64
if [ $ANDROID_API -lt 21 ]; then
echo "Warning: arm64-v8a requires ANDROID_API >= 21."
ANDROID_API=21
fi
else # armeabi, armeabi-v7a
ANDROID_ARCH=arm
fi
ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
cat <<EOF
============================================
Generating the standalone toolchain ...
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh
--arch=$ANDROID_ARCH
--platform=android-$ANDROID_API
--install-dir=${ANDROID_STANDALONE_TOOLCHAIN}
============================================
EOF
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
--arch=$ANDROID_ARCH \
--platform=android-$ANDROID_API \
--install-dir=$ANDROID_STANDALONE_TOOLCHAIN
BUILD_ROOT=/paddle/build_android
DEST_ROOT=/paddle/install_android
mkdir -p $BUILD_ROOT
cd $BUILD_ROOT
if [ $ANDROID_ABI == "armeabi-v7a" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_NEON=ON \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
..
elif [ $ANDROID_ABI == "arm64-v8a" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DUSE_EIGEN_FOR_BLAS=OFF \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
..
elif [ $ANDROID_ABI == "armeabi" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
..
else
echo "Invalid ANDROID_ABI: $ANDROID_ABI"
fi
cat <<EOF
============================================
Building in $BUILD_ROOT ...
============================================
EOF
make -j `nproc`
make install -j `nproc`
#!/bin/bash
/usr/sbin/sshd -D &
jupyter notebook --ip=0.0.0.0 /paddle/book/
#!/bin/bash
set -e
# the number of process to run tests
NUM_PROC=6
# calculate and set the memory usage for each process
MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
# get the CUDA device count
CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
for (( i = 0; i < $NUM_PROC; i++ )); do
cuda_list=()
for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do
s=$[i+j]
n=$[s%CUDA_DEVICE_COUNT]
if [ $j -eq 0 ]; then
cuda_list=("$n")
else
cuda_list="$cuda_list,$n"
fi
done
echo $cuda_list
# CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
# ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC --output-on-failure &
done
wait
......@@ -99,12 +99,13 @@ function cmake_gen() {
-DWITH_PYTHON=${WITH_PYTHON:-ON}
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-DCUDNN_ROOT=/usr/
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=ON
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
......@@ -126,12 +127,12 @@ EOF
-DWITH_C_API=${WITH_C_API:-OFF} \
-DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=ON
}
function abort(){
......@@ -231,7 +232,6 @@ EOF
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
..
elif [ $ANDROID_ABI == "arm64-v8a" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
......@@ -245,7 +245,6 @@ EOF
-DUSE_EIGEN_FOR_BLAS=OFF \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
..
elif [ $ANDROID_ABI == "armeabi" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
......@@ -258,7 +257,6 @@ EOF
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
..
else
echo "Invalid ANDROID_ABI: $ANDROID_ABI"
......@@ -287,7 +285,6 @@ function build_ios() {
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_TESTING=OFF \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
-DCMAKE_BUILD_TYPE=Release
make -j 2
......@@ -375,8 +372,7 @@ EOF
-DCMAKE_BUILD_TYPE=Release \
-DWITH_DOC=ON \
-DWITH_GPU=OFF \
-DWITH_MKL=OFF \
-DWITH_STYLE_CHECK=OFF
-DWITH_MKL=OFF
make -j `nproc` paddle_docs paddle_apis
......@@ -415,9 +411,11 @@ function gen_dockerfile() {
DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
DOCKERFILE_CUBLAS_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so"
DOCKERFILE_CUDNN_DSO="RUN ln -sf /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so"
DOCKERFILE_CUBLAS_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.${CUDA_MAJOR} /usr/lib/x86_64-linux-gnu/libcublas.so"
fi
cat <<EOF
......@@ -458,6 +456,7 @@ EOF
${PADDLE_VERSION} && \
ldconfig
${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_CUBLAS_DSO}
${DOCKERFILE_GPU_ENV}
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
......@@ -493,7 +492,9 @@ function gen_fluid_inference_lib() {
========================================
EOF
make -j `nproc` inference_lib_dist
tar -cf ${PADDLE_ROOT}/build/fluid.tgz ${PADDLE_ROOT}/build/fluid_install_dir
cd ${PADDLE_ROOT}/build
mv fluid_install_dir fluid
tar -cf fluid.tgz fluid
fi
}
......
#!/bin/bash
set -e
# Create the build directory for CMake.
mkdir -p $TRAVIS_BUILD_DIR/build
cd $TRAVIS_BUILD_DIR/build
# Compile Documentation only.
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF
make -j `nproc` paddle_docs paddle_apis
# check websites for broken links
linkchecker doc/v2/en/html/index.html
linkchecker doc/v2/cn/html/index.html
linkchecker doc/v2/api/en/html/index.html
#!/bin/bash
set -e
# Create the build directory for CMake.
mkdir -p $TRAVIS_BUILD_DIR/build_ios
cd $TRAVIS_BUILD_DIR/build_ios
# Compile paddle binaries
cmake -DCMAKE_SYSTEM_NAME=iOS \
-DIOS_PLATFORM=OS \
-DCMAKE_OSX_ARCHITECTURES="arm64" \
-DWITH_C_API=ON \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_TESTING=OFF \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
-DCMAKE_BUILD_TYPE=Release \
..
make -j 2
#!/bin/bash
function abort(){
echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
echo "Please use pre-commit to check what is wrong." 1>&2
exit 1
}
trap 'abort' 0
set -e
# install glide
curl https://glide.sh/get | bash
eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
# set up go environment for running gometalinter
mkdir -p $GOPATH/src/github.com/PaddlePaddle/
ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
cd $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
go get github.com/alecthomas/gometalinter
gometalinter --install
cd $TRAVIS_BUILD_DIR
export PATH=/usr/bin:$PATH
pre-commit install
clang-format --version
if ! pre-commit run -a ; then
git diff
exit 1
fi
trap : 0
......@@ -36,17 +36,12 @@ endif()
add_library(paddle_trainer_lib STATIC
${TRAINER_SOURCES})
add_style_check_target(paddle_trainer_lib
${TRAINER_SOURCES})
add_style_check_target(paddle_trainer_lib
${TRAINER_HEADERS})
add_dependencies(paddle_trainer_lib
paddle_proto
${external_project_dependencies})
macro(add_paddle_exe TARGET_NAME)
add_executable(${TARGET_NAME} ${ARGN})
add_style_check_target(${TARGET_NAME} ${ARGN})
link_paddle_exe(${TARGET_NAME})
endmacro()
......
......@@ -14,9 +14,6 @@ add_library(paddle_utils STATIC
${UTIL_SOURCES}
${UTIL_ARCH_SOURCES}
${UTIL_RES})
add_style_check_target(paddle_utils ${UTIL_HEADERS})
add_style_check_target(paddle_utils ${UTIL_SOURCES}
${UTIL_ARCH_SOURCES})
add_dependencies(paddle_utils paddle_proto ${external_project_dependencies})
if(WITH_TESTING)
add_subdirectory(tests)
......
......@@ -48,6 +48,7 @@ from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, \
InferenceTranspiler, memory_optimize, release_memory
from concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close, Select)
from lod_tensor import create_lod_tensor, create_random_int_lodtensor
import clip
import profiler
import unique_name
......@@ -59,7 +60,7 @@ Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
parallel_executor.__all__ + [
parallel_executor.__all__ + lod_tensor.__all__ + [
'io',
'initializer',
'layers',
......
......@@ -51,6 +51,12 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
op_desc.set_input(para, args)
for para, args in outputs.iteritems():
op_desc.set_output(para, args)
op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
if op_role_attr_name not in attrs:
attrs[
op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
for name, val in attrs.iteritems():
if isinstance(val, framework.Block):
op_desc.set_block_attr(name, val.desc)
......@@ -335,9 +341,12 @@ def _append_backward_ops_(block,
no_grad_dict[block.idx])
# append op_desc in grad_op_descs to target_block
op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
backward = core.op_proto_and_checker_maker.OpRole.Backward
for op_desc in grad_op_descs:
new_op_desc = target_block.desc.append_op()
new_op_desc.copy_from(op_desc)
new_op_desc.set_attr(op_role_attr_name, backward)
grad_to_var["__current_op_desc__"] = new_op_desc
if callbacks is not None:
assert (isinstance(callbacks, list))
......@@ -439,6 +448,22 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
(list[(Variable,Variable)]): list of (parameter, gradient) pair.
"""
assert isinstance(loss, framework.Variable)
if loss.op is None:
# the loss is from a cloned program. Find loss op manually.
for op in reversed(loss.block.ops):
assert isinstance(op, framework.Operator)
if len(op.output_arg_names) == 1 and op.output_arg_names[
0] == loss.name:
loss.op = op
break
if loss.op is None:
raise ValueError("loss.op is None. Should not happend")
loss.op.set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
int(core.op_proto_and_checker_maker.OpRole.Forward) |
int(core.op_proto_and_checker_maker.OpRole.Loss))
if callbacks is not None:
isinstance(callbacks, list)
......@@ -456,12 +481,16 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
current_block_idx = program.current_block_idx
grad_to_var = dict()
op_desc = _create_op_desc_("fill_constant", {}, {
"Out": [_append_grad_suffix_(loss.name)]
}, {"shape": [1],
op_desc = _create_op_desc_(
"fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, {
"shape": [1],
"value": 1.0,
"dtype": loss.dtype,
"force_cpu": False})
"force_cpu": False,
core.op_proto_and_checker_maker.kOpRoleAttrName():
int(core.op_proto_and_checker_maker.OpRole.Backward) |
int(core.op_proto_and_checker_maker.OpRole.Loss),
})
root_block.desc.append_op().copy_from(op_desc)
block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
......@@ -505,6 +534,24 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
params_and_grads.append((param_var, grad_var))
else:
params_and_grads.append((param_var, None))
op_role_var_attr_name = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
for p, g in params_and_grads:
if g is None:
continue
for op in reversed(program.global_block().ops):
assert isinstance(op, framework.Operator)
if g.name in op.output_arg_names:
g.op = op
break
if g.op is None:
raise ValueError("Unexpected branch")
attr_val = [p.name, g.name]
if g.op.has_attr(op_role_var_attr_name):
attr_val.extend(g.op.attr(op_role_var_attr_name))
g.op.set_attr(op_role_var_attr_name, attr_val)
return params_and_grads
......
......@@ -214,21 +214,24 @@ def set_gradient_clip(clip, param_list=None, program=None):
def append_gradient_clip_ops(param_grad):
context = dict()
create_op_callbacks = []
for p, g in param_grad:
with p.block.program.optimized_guard(p):
clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
if clip_attr is None:
clip_attr = NullGradientClipAttr()
if not isinstance(clip_attr, BaseGradientClipAttr):
raise TypeError(
"clip attribute should be an instance of BaseGradientClipAttr")
"clip attribute should be an instance of BaseGradientClipAttr"
)
clip_attr.process_context(context=context, param=p, grad=g)
create_op_callbacks.append(
functools.partial(
clip_attr.create_operators, param=p, grad=g))
return [each_callback() for each_callback in create_op_callbacks]
res = []
for p, g in param_grad:
with p.block.program.optimized_guard(p):
res.append(clip_attr.create_operators(param=p, grad=g))
return res
ClipByValue = GradientClipByValue
......
......@@ -404,6 +404,23 @@ class Operator(object):
self.block = block
self.desc = desc
self.attrs = attrs
if self.attrs is None:
self.attrs = dict()
del attrs
op_maker = core.op_proto_and_checker_maker
if op_maker.kOpRoleAttrName() not in self.attrs:
self.attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role
role_var_name = op_maker.kOpRoleVarAttrName()
if len(self.block.program.
op_role_var) != 0 and role_var_name not in self.attrs:
self.attrs[role_var_name] = self.block.program.op_role_var
if role_var_name in self.attrs and len(self.attrs[role_var_name]) == 0:
del self.attrs[role_var_name]
if len(self.desc.type()) != 0:
return
if type is None:
......@@ -469,22 +486,23 @@ class Operator(object):
arg.op = self
self.desc.set_output(out_proto.name, out_arg_names)
if attrs is not None:
if not isinstance(attrs, dict):
if self.attrs is not None:
if not isinstance(self.attrs, dict):
raise TypeError("'attrs' should be a dict.")
for attr in proto.attrs:
attr_name = attr.name
if (attr_name not in attrs) or (attrs[attr_name] is None):
if (attr_name not in self.attrs) or (
self.attrs[attr_name] is None):
continue
if isinstance(attrs[attr_name], Block):
self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
elif isinstance(attrs[attr_name], core.BlockDesc) or \
isinstance(attrs[attr_name], core.ProgramDesc):
if isinstance(self.attrs[attr_name], Block):
self.desc.set_block_attr(attr_name,
self.attrs[attr_name].desc)
elif isinstance(self.attrs[attr_name], core.BlockDesc) or \
isinstance(self.attrs[attr_name], core.ProgramDesc):
self.desc.set_serialized_attr(
attr_name, attrs[attr_name].serialize_to_string())
attr_name, self.attrs[attr_name].serialize_to_string())
else:
self.desc.set_attr(attr_name, attrs[attr_name])
self.desc.set_attr(attr_name, self.attrs[attr_name])
self.desc.check_attrs()
no_kernel_op_set = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
......@@ -612,6 +630,10 @@ class Operator(object):
"""
return self.desc.attr_type(name)
def set_attr(self, name, val):
self.attrs[name] = val
self.desc.set_attr(name, val)
@property
def attr_names(self):
"""
......@@ -1002,6 +1024,33 @@ class Program(object):
self.blocks = [Block(self, 0)]
self.current_block_idx = 0
self._seed = 0
self._current_role = core.op_proto_and_checker_maker.OpRole.Forward
self._op_role_var = []
@property
def op_role(self):
return self._current_role
@op_role.setter
def set_op_role(self, role):
self._current_role = role
@property
def op_role_var(self):
return self._op_role_var
@op_role_var.setter
def set_op_role_var(self, var_name):
self._op_role_var = [var_name]
@contextlib.contextmanager
def optimized_guard(self, var):
OpRole = core.op_proto_and_checker_maker.OpRole
self._current_role = OpRole.Optimize
self._op_role_var = [var.name if isinstance(var, Variable) else var]
yield
self._op_role_var = []
self._current_role = OpRole.Forward
def __str__(self):
return self.to_string(True)
......
......@@ -56,7 +56,7 @@ class Inferencer(object):
else:
self.exe = executor.Executor(self.place)
def infer(self, inputs):
def infer(self, inputs, return_numpy=True):
"""
:param inputs: a map of {"input_name": input_var} that will be feed into the inference program
to get the predict value
......@@ -66,9 +66,11 @@ class Inferencer(object):
raise ValueError(
"inputs should be a map of {'input_name': input_var}")
with self._prog_and_scope_guard():
results = self.exe.run(feed=inputs,
fetch_list=[self.predict_var.name])
with executor.scope_guard(self.scope):
results = self.exe.run(self.inference_program,
feed=inputs,
fetch_list=[self.predict_var],
return_numpy=return_numpy)
return results
......
......@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from .. import core
from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
......@@ -21,7 +22,8 @@ from ..executor import global_scope
__all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer'
'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
'random_data_generator', 'Preprocessor'
]
......@@ -535,8 +537,6 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
inputs={'UnderlyingReader': reader},
outputs={'Out': [new_reader]},
attrs=attrs)
new_reader.persistable = True
new_reader.stop_gradient = True
return monkey_patch_reader_methods(new_reader)
......@@ -581,3 +581,82 @@ def read_file(file_obj):
return out[0]
else:
return out
class Preprocessor(object):
BEFORE_SUB_BLOCK = 0
IN_SUB_BLOCK = 1
AFTER_SUB_BLOCK = 2
def __init__(self, reader, name=None):
self.underlying_reader = reader
new_reader_name = name if name is not None else unique_name(
"create_custom_reader")
self.main_prog = default_main_program()
self.reader = self.main_prog.current_block().create_var(
name=new_reader_name)
self.sub_block = None
self.source_var_names = None
self.sink_var_names = None
self.status = Preprocessor.BEFORE_SUB_BLOCK
def is_completed(self):
return self.sub_block and self.source_var_names and self.sink_var_names
@contextlib.contextmanager
def block(self):
self.status = Preprocessor.IN_SUB_BLOCK
self.sub_block = self.main_prog.create_block()
yield
self.main_prog.rollback()
self.status = Preprocessor.AFTER_SUB_BLOCK
if not self.is_completed():
raise RuntimeError(
"The definition of preprocessor is incompleted! "
"Please make sure that you have set input and output "
"variables by invoking 'inputs' and 'outputs' in "
"Preprocessor's sub-block.")
def inputs(self):
if self.status != Preprocessor.IN_SUB_BLOCK:
raise RuntimeError(
"Preprocessor.inputs() can only be invoked inside the sub-block."
)
source_shapes = self.underlying_reader.desc.shapes()
source_dtypes = self.underlying_reader.desc.dtypes()
source_lod_levels = self.underlying_reader.desc.lod_levels()
self.source_var_names = [
unique_name("preprocessor_source")
for _ in xrange(len(source_shapes))
]
source_vars = []
for var_name, shape, dtype, lod_level in zip(
self.source_var_names, source_shapes, source_dtypes,
source_lod_levels):
source_vars.append(self.main_prog.current_block().create_var(
name=var_name, shape=shape, dtype=dtype, lod_level=lod_level))
return source_vars
def outputs(self, *outs):
if self.status != Preprocessor.IN_SUB_BLOCK:
raise RuntimeError(
"Preprocessor.outputs() can only be invoked inside the sub-block."
)
self.sink_var_names = [var.name for var in outs]
def __call__(self, *args, **kwargs):
if self.status != Preprocessor.AFTER_SUB_BLOCK:
raise RuntimeError(
"Preprocessor output can only be retrieved after rnn block.")
self.main_prog.current_block().append_op(
type="create_custom_reader",
inputs={'UnderlyingReader': self.underlying_reader},
outputs={'Out': [self.reader]},
attrs={
"sub_block": self.sub_block,
"source_var_names": self.source_var_names,
"sink_var_names": self.sink_var_names
})
return monkey_patch_reader_methods(self.reader)
......@@ -81,6 +81,7 @@ __all__ = [
'label_smooth',
'roi_pool',
'dice_loss',
'upsampling_bilinear2d',
]
......@@ -3852,6 +3853,8 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
(num_rois, channels, pooled_h, pooled_w).
Examples:
.. code-block:: python
pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
"""
helper = LayerHelper('roi_pool', **locals())
......@@ -3899,6 +3902,8 @@ def dice_loss(input, label, epsilon=0.00001):
dice_loss (Variable): The dice loss with shape [1].
Examples:
.. code-block:: python
predictions = fluid.layers.softmax(x)
loss = fluid.layers.dice_loss(input=predictions, label=label, 2)
"""
......@@ -3910,3 +3915,66 @@ def dice_loss(input, label, epsilon=0.00001):
label, dim=reduce_dim)
dice_score = 1 - inse * 2 / (dice_denominator + epsilon)
return reduce_mean(dice_score)
def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
"""
The mathematical meaning of upsampling_bilinear2d is also called
Bilinear interpolation.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this layer) on a rectilinear 2D grid.
For details, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Bilinear_interpolation
Args:
input (Variable): The input tensor of bilinear interpolation,
This is a 4-D tensor of the shape
(num_batches, channels, in_h, in_w).
out_shape(list|tuple|None): Output shape of bilinear interpolation
layer, the shape is (out_h, out_w).
Default: None
scale(int|None): The multiplier for the input height or width.
At least one of out_shape or scale must be set.
And out_shape has a higher priority than scale.
Default: None
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
out (Variable): The output is a 4-D tensor of the shape
(num_batches, channls, out_h, out_w).
Examples:
.. code-block:: python
out = fluid.layers.bilinear_interp(input, out_shape=[12, 12])
"""
if out_shape is None and scale is None:
raise ValueError("One of out_shape and scale must not be None")
helper = LayerHelper('bilinear_interp', **locals())
dtype = helper.input_dtype()
def _is_list_or_turple_(data):
return (isinstance(data, list) or isinstance(data, tuple))
if out_shape is not None:
if not (_is_list_or_turple_(out_shape) and len(out_shape) == 2):
raise ValueError('out_shape should be a list or tuple ',
'with length 2, (out_h, out_w).')
out_shape = list(map(int, out_shape))
out_h = out_shape[0]
out_w = out_shape[1]
else:
out_h = int(input.shape[2] * scale)
out_w = int(input.shape[3] * scale)
out = helper.create_tmp_variable(dtype)
helper.append_op(
type="bilinear_interp",
inputs={"X": input},
outputs={"Out": out},
attrs={"out_h": out_h,
"out_w": out_w})
return out
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import core
import numpy as np
__all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
def _validate_lod(lod, tensor_height=-1):
"""Check whether the input length-based lod info is valid.
There are several things to check:
1. lod should be a list of lists. Empty list is fine.
2. The length of each sublist (a lod level) should be at least one.
3. Each element in each lod level should be an integer greater than 0.
4. The sum of one lod level should be equal to the length of the next lod level.
5. The sum of the last lod level should be equal to the tensor height.
Bypass this check if user does not provide tensor_height as input.
Args:
lod: the length-based lod info, e.g., [[2, 3], [2, 1, 2, 3, 4]].
tensor_height: the outermost dimension of the tensor with which the input
lod is associated with.
Returns:
A boolean indicating whether the input lod is valid or not.
"""
assert isinstance(lod, list), "lod should be a list"
# Empty lod is fine
if len(lod) == 0:
return True
lod_sum = []
for level in lod:
assert isinstance(level, list), "each item in lod should be a list"
# Each level of lod should have at least one length info
if len(level) < 1:
return False
level_sum = 0
for lod_len in level:
# Each length in a level should be > 0
if lod_len <= 0:
return False
level_sum += lod_len
lod_sum.append(level_sum)
for idx, val in enumerate(lod_sum[:-1]):
# Each level's sum should be equal to
# the number of items in the next level
if val != len(lod[idx + 1]):
return False
if tensor_height == -1:
return True
else:
# Last level's sum should be equal to the tensor height
return lod_sum[-1] == tensor_height
def _convert_lod(lod):
"""Convert a length-based lod to a offset-based lod.
If the length-based lod is [[2, 3], [2, 1, 2, 3, 4]],
then the offset-based lod is [[0, 2, 5], [0, 2, 3, 5, 8, 12]].
Args:
lod: a length-based lod info.
Returns:
A list of lists as the offset-based lod converted to from the input lod.
"""
new_lod = []
for level in lod:
cur_len = 0
new_level = [cur_len]
for lod_len in level:
cur_len += lod_len
new_level.append(cur_len)
new_lod.append(new_level)
return new_lod
def create_lod_tensor(data, lod, place):
"""Create a lod tensor from a numpy array, a list, or an existing lod tensor.
Create a lod tensor by doing the following:
1. Check that the length-based input lod is valid.
2. Convert the length-based lod to a offset-based LoD.
3. Copy the data from a numpy array, a list or a existing lod tensor to
CPU or GPU device (based on input place).
4. Set the level of detail (LoD) using the offset-based LoD.
Use example:
Suppose we want LoDTensor to hold data for sequences of word, where each word is
represented by an integer. If we want to create a LoDTensor to represent two
sentences, one of 2 words, and one of 3 words.
Then 'data' can be a numpy array of integers with shape (5, 1).
'lod' will be [[2, 3]], indicating the length(# of words) in each sentence.
This length-based input lod [[2, 3]] will be converted to offset-based lod [[0, 2, 5]]
inside the function call.
Please refer to
github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md
for more details regarding LoD.
Args:
data: a numpy array or a LoDTensor or a list holding the data to be copied.
lod: a list of lists indicating the length-based LoD info specified by the user.
place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
Returns:
A fluid LoDTensor object with tensor data and lod info.
"""
if isinstance(data, core.LoDTensor):
return create_lod_tensor(np.array(data), lod, place)
elif isinstance(data, list):
# When input data is a list, it only deal with the case where the base element
# is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated
# LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number
# of words or other indexes in the sequence.
new_lod = []
for seq in data:
new_lod.append(len(seq))
assert [new_lod] == lod, "data and lod do not match"
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
return create_lod_tensor(flattened_data, lod, place)
elif isinstance(data, np.ndarray):
assert _validate_lod(lod,
data.shape[0]), "the provided lod info is invalid"
tensor = core.LoDTensor()
tensor.set(data, place)
tensor.set_lod(_convert_lod(lod))
return tensor
else:
raise TypeError(
"data should be either a LoDTensor, a Numpy array or a list")
def create_random_int_lodtensor(lod, base_shape, place, low, high):
"""Create a LoDTensor containing random integers.
This function is frequently used in the book examples. So we revised it based on
the new create_lod_tensor API and put it here in the lod_tensor module to simplify
the code.
The function does the following:
1. Calculate the overall shape of the LoDTensor based on the length-based 'lod' input
and the shape of the basic element in 'base_shape'.
2. Create a numpy array of this shape.
3. Create the LoDTensor using create_lod_tensor API.
Suppose we want LoDTensor to hold data for sequences of word, where each word is
represented by an integer. If we want to create a LoDTensor to represent two
sentences, one of 2 words, and one of 3 words. Then 'base_shape' is [1], input
length-based 'lod' is [[2, 3]]. Then the overall shape of the LoDTensor would be
[5, 1], holding 5 words for two sentences.
Args:
data: a numpy array or a LoDTensor holding the data to be copied.
lod: a list of lists indicating the length-based LoD info specified by the user.
base_shape: the shape of the basic element to be held by the LoDTensor.
place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
low: the lower bound of the random integers.
high: the upper bound of the random integers.
Returns:
A fluid LoDTensor object with tensor data and lod info.
"""
assert isinstance(base_shape, list), "base_shape should be a list"
converted_lod = _convert_lod(lod)
# append the total number of basic elements to the front of its shape
overall_shape = [converted_lod[-1][-1]] + base_shape
# the range of integer data elements is [low, high]
data = np.random.random_integers(low, high, overall_shape).astype("int64")
return create_lod_tensor(data, lod, place)
......@@ -213,6 +213,8 @@ class Optimizer(object):
optimize_ops = []
for param_and_grad in parameters_and_grads:
with param_and_grad[0].block.program.optimized_guard(
param_and_grad[0]):
if param_and_grad[0].trainable is True and param_and_grad[
1] is not None:
optimize_op = self._append_optimize_op(loss.block,
......
......@@ -43,6 +43,7 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
"""
params_and_grads = []
for param, grad in parameters_and_grads:
with param.block.program.optimized_guard(param):
# If no gradient then we don't need to do anything
if grad is None:
params_and_grads.append((param, grad))
......
......@@ -10,3 +10,7 @@ add_subdirectory(fit_a_line)
add_subdirectory(recognize_digits)
add_subdirectory(image_classification)
add_subdirectory(understand_sentiment)
add_subdirectory(label_semantic_roles)
add_subdirectory(word2vec)
add_subdirectory(recommender_system)
add_subdirectory(machine_translation)
......@@ -48,7 +48,7 @@ def linear():
return avg_loss
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
......@@ -68,8 +68,8 @@ def train(use_cuda, train_program, save_dirname):
['15.343549569447836']
...
'''
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
trainer.stop()
trainer.train(
......@@ -80,19 +80,19 @@ def train(use_cuda, train_program, save_dirname):
# infer
def infer(use_cuda, inference_program, save_dirname=None):
if save_dirname is None:
def infer(use_cuda, inference_program, params_dirname=None):
if params_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 10
tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
results = inferencer.infer({'x': tensor_x})
print("infer results: ", numpy.array(results[0]))
print("infer results: ", results[0])
def main(use_cuda):
......@@ -100,10 +100,10 @@ def main(use_cuda):
return
# Directory for saving the trained model
save_dirname = "fit_a_line.inference.model"
params_dirname = "fit_a_line.inference.model"
train(use_cuda, linear, save_dirname)
infer(use_cuda, inference_program, save_dirname)
train(use_cuda, linear, params_dirname)
infer(use_cuda, inference_program, params_dirname)
class TestFitALine(unittest.TestCase):
......
......@@ -85,7 +85,7 @@ def train_network():
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
BATCH_SIZE = 128
EPOCH_NUM = 1
......@@ -105,8 +105,8 @@ def train(use_cuda, train_program, save_dirname):
print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......@@ -122,10 +122,10 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
......@@ -142,12 +142,14 @@ def main(use_cuda):
save_path = "image_classification_resnet.inference.model"
train(
use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
save_dirname=save_path)
params_dirname=save_path)
if __name__ == '__main__':
......
......@@ -64,7 +64,7 @@ def train_network():
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
BATCH_SIZE = 128
train_reader = paddle.batch(
paddle.reader.shuffle(
......@@ -82,8 +82,8 @@ def train(use_cuda, train_program, save_dirname):
print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......@@ -99,10 +99,10 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
......@@ -119,12 +119,14 @@ def main(use_cuda):
save_path = "image_classification_vgg.inference.model"
train(
use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
save_dirname=save_path)
params_dirname=save_path)
if __name__ == '__main__':
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
......@@ -16,21 +16,23 @@ from __future__ import print_function
import paddle
import paddle.fluid as fluid
import numpy
import numpy as np
WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict()
WORD_DICT_LEN = len(WORD_DICT)
LABEL_DICT_LEN = len(LABEL_DICT)
PRED_DICT_LEN = len(VERB_DICT)
MARK_DICT_LEN = 2
IS_SPARSE = True
BATCH_SIZE = 10
EMBEDDING_NAME = 'emb'
def lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark):
def lstm_net():
WORD_DIM = 32
MARK_DIM = 5
HIDDEN_DIM = 512
DEPTH = 8
EMBEDDING_NAME = 'emb'
# Data definitions
word = fluid.layers.data(
......@@ -69,8 +71,9 @@ def lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark):
fluid.layers.embedding(
size=[WORD_DICT_LEN, WORD_DIM],
input=x,
param_attr=fluid.ParamAttr(
name=EMBEDDING_NAME, trainable=False)) for x in word_input
param_attr=fluid.ParamAttr(name=EMBEDDING_NAME))
for x in word_input
#name=EMBEDDING_NAME, trainable=False)) for x in word_input
]
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
......@@ -116,21 +119,16 @@ def lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark):
return feature_out
def inference_network():
predict = lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,
mark)
crf_decode = fluid.layers.crf_decoding(
input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
def inference_program():
predict = lstm_net()
return crf_decode
return predict
def train_network():
def train_program():
MIX_HIDDEN_LR = 1e-3
predict = lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,
mark)
predict = lstm_net()
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
crf_cost = fluid.layers.linear_chain_crf(
......@@ -140,68 +138,102 @@ def train_network():
name='crfw', learning_rate=MIX_HIDDEN_LR))
avg_cost = fluid.layers.mean(crf_cost)
return avg_cost
return [avg_cost]
def train(use_cuda, save_path):
BATCH_SIZE = 128
EPOCH_NUM = 1
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.conll05.train(), buf_size=8192),
batch_size=BATCH_SIZE)
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer=optimizer)
feed_order = [
'word_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
'ctx_p2_data', 'verb_data', 'mark_data', 'target'
]
#embedding_param = fluid.global_scope().find_var(
# EMBEDDING_NAME).get_tensor()
#embedding_param.set(
# load_parameter(conll05.get_embedding(), WORD_DICT_LEN, WORD_DIM),
# place)
def event_handler(event):
if isinstance(event, fluid.EndEpochEvent):
test_reader = paddle.batch(
paddle.dataset.conll05.test(), batch_size=BATCH_SIZE)
avg_cost_set = trainer.test(
reader=test_reader, feed_order=feed_order)
def event_handler(event):
if isinstance(event, fluid.EndIteration):
if (event.batch_id % 10) == 0:
avg_cost = trainer.test(reader=test_reader)
# get avg cost
avg_cost = np.array(avg_cost_set).mean()
print('BatchID {0:04}, Loss {1:2.2}'.format(event.batch_id + 1,
avg_cost))
print("avg_cost: %s" % avg_cost)
if avg_cost > 0.01: # Low threshold for speeding up CI
trainer.save_params(save_path)
return
if float(avg_cost) < 100.0: # Large value to increase CI speed
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
float(avg_cost)))
if math.isnan(float(avg_cost)):
sys.exit("got NaN loss, training failed.")
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.01,
decay_steps=100000,
decay_rate=0.5,
staircase=True))
trainer = fluid.Trainer(train_network, optimizer=sgd_optimizer, place=place)
trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
elif isinstance(event, fluid.EndStepEvent):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192),
batch_size=BATCH_SIZE)
trainer.train(
num_epochs=1,
event_handler=event_handler,
reader=train_reader,
feed_order=feed_order)
def infer(use_cuda, save_path):
def infer(use_cuda, inference_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
inference_program, param_path=save_path, place=place)
def create_random_lodtensor(lod, place, low, high):
data = np.random.random_integers(low, high,
[lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
# Create an input example
lod = [0, 4, 10]
word = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
pred = create_random_lodtensor(lod, place, low=0, high=PRED_DICT_LEN - 1)
ctx_n2 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
ctx_n1 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
ctx_0 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
ctx_p1 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
ctx_p2 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
mark = create_random_lodtensor(lod, place, low=0, high=MARK_DICT_LEN - 1)
results = inferencer.infer({
inference_program, param_path=params_dirname, place=place)
# Setup inputs by creating LoDTensors to represent sequences of words.
# Here each word is the basic element of these LoDTensors and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensors will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists.
lod = [[3, 4, 2]]
base_shape = [1]
# The range of random integers is [low, high]
word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
pred = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=PRED_DICT_LEN - 1)
ctx_n2 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
ctx_n1 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
ctx_0 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
ctx_p1 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
ctx_p2 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
mark = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=MARK_DICT_LEN - 1)
results = inferencer.infer(
{
'word_data': word,
'verb_data': pred,
'ctx_n2_data': ctx_n2,
......@@ -210,17 +242,18 @@ def infer(use_cuda, save_path):
'ctx_p1_data': ctx_p1,
'ctx_p2_data': ctx_p2,
'mark_data': mark
})
},
return_numpy=False)
print("infer results: ", results)
print("infer results: ", np.array(results[0]))
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "label_semantic_roles.inference.model"
train(use_cuda, save_path)
infer(use_cuda, save_path)
params_dirname = "label_semantic_roles.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.framework as framework
import paddle.fluid.layers as pd
from paddle.fluid.executor import Executor
from functools import partial
import unittest
import os
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
hidden_dim = 32
word_dim = 16
batch_size = 2
max_length = 8
topk_size = 50
trg_dic_size = 10000
beam_size = 2
decoder_size = hidden_dim
def encoder(is_sparse):
# encoder
src_word_id = pd.data(
name="src_word_id", shape=[1], dtype='int64', lod_level=1)
src_embedding = pd.embedding(
input=src_word_id,
size=[dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse,
param_attr=fluid.ParamAttr(name='vemb'))
fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
encoder_out = pd.sequence_last_step(input=lstm_hidden0)
return encoder_out
def decoder_train(context, is_sparse):
# decoder
trg_language_word = pd.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1)
trg_embedding = pd.embedding(
input=trg_language_word,
size=[dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse,
param_attr=fluid.ParamAttr(name='vemb'))
rnn = pd.DynamicRNN()
with rnn.block():
current_word = rnn.step_input(trg_embedding)
pre_state = rnn.memory(init=context)
current_state = pd.fc(input=[current_word, pre_state],
size=decoder_size,
act='tanh')
current_score = pd.fc(input=current_state,
size=target_dict_dim,
act='softmax')
rnn.update_memory(pre_state, current_state)
rnn.output(current_score)
return rnn()
def decoder_decode(context, is_sparse):
init_state = context
array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
# fill the first element with init_state
state_array = pd.create_array('float32')
pd.array_write(init_state, array=state_array, i=counter)
# ids, scores as memory
ids_array = pd.create_array('int64')
scores_array = pd.create_array('float32')
init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
init_scores = pd.data(
name="init_scores", shape=[1], dtype="float32", lod_level=2)
pd.array_write(init_ids, array=ids_array, i=counter)
pd.array_write(init_scores, array=scores_array, i=counter)
cond = pd.less_than(x=counter, y=array_len)
while_op = pd.While(cond=cond)
with while_op.block():
pre_ids = pd.array_read(array=ids_array, i=counter)
pre_state = pd.array_read(array=state_array, i=counter)
pre_score = pd.array_read(array=scores_array, i=counter)
# expand the lod of pre_state to be the same with pre_score
pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
pre_ids_emb = pd.embedding(
input=pre_ids,
size=[dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
# use rnn unit to update rnn
current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
size=decoder_size,
act='tanh')
current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
# use score to do beam search
current_score = pd.fc(input=current_state_with_lod,
size=target_dict_dim,
act='softmax')
topk_scores, topk_indices = pd.topk(current_score, k=topk_size)
selected_ids, selected_scores = pd.beam_search(
pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
pd.increment(x=counter, value=1, in_place=True)
# update the memories
pd.array_write(current_state, array=state_array, i=counter)
pd.array_write(selected_ids, array=ids_array, i=counter)
pd.array_write(selected_scores, array=scores_array, i=counter)
pd.less_than(x=counter, y=array_len, cond=cond)
translation_ids, translation_scores = pd.beam_search_decode(
ids=ids_array, scores=scores_array)
# return init_ids, init_scores
return translation_ids, translation_scores
def set_init_lod(data, lod, place):
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod(lod)
return res
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def train_program(is_sparse):
context = encoder(is_sparse)
rnn_out = decoder_train(context, is_sparse)
label = pd.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
cost = pd.cross_entropy(input=rnn_out, label=label)
avg_cost = pd.mean(cost)
return avg_cost
def train(use_cuda, is_sparse, is_local=True):
EPOCH_NUM = 1
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size)
feed_order = [
'src_word_id', 'target_language_word', 'target_language_next_word'
]
def event_handler(event):
if isinstance(event, fluid.EndStepEvent):
print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step))
if event.step == 10:
trainer.stop()
trainer = fluid.Trainer(
train_func=partial(train_program, is_sparse),
optimizer=fluid.optimizer.Adagrad(
learning_rate=1e-4,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.1)),
place=place)
trainer.train(
reader=train_reader,
num_epochs=EPOCH_NUM,
event_handler=event_handler,
feed_order=feed_order)
def decode_main(use_cuda, is_sparse):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
context = encoder(is_sparse)
translation_ids, translation_scores = decoder_decode(context, is_sparse)
exe = Executor(place)
exe.run(framework.default_startup_program())
init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
init_scores_data = np.array(
[1. for _ in range(batch_size)], dtype='float32')
init_ids_data = init_ids_data.reshape((batch_size, 1))
init_scores_data = init_scores_data.reshape((batch_size, 1))
init_lod = [i for i in range(batch_size)] + [batch_size]
init_lod = [init_lod, init_lod]
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size)
for _, data in enumerate(train_data()):
init_ids = set_init_lod(init_ids_data, init_lod, place)
init_scores = set_init_lod(init_scores_data, init_lod, place)
src_word_data = to_lodtensor(map(lambda x: x[0], data), place)
result_ids, result_scores = exe.run(
framework.default_main_program(),
feed={
'src_word_id': src_word_data,
'init_ids': init_ids,
'init_scores': init_scores
},
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
print result_ids.lod()
break
class TestMachineTranslation(unittest.TestCase):
pass
@contextlib.contextmanager
def scope_prog_guard():
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
def inject_test_train(use_cuda, is_sparse):
f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
if is_sparse else 'dense')
def f(*args):
with scope_prog_guard():
train(use_cuda, is_sparse)
setattr(TestMachineTranslation, f_name, f)
def inject_test_decode(use_cuda, is_sparse, decorator=None):
f_name = 'test_{0}_{1}_decode'.format('cuda'
if use_cuda else 'cpu', 'sparse'
if is_sparse else 'dense')
def f(*args):
with scope_prog_guard():
decode_main(use_cuda, is_sparse)
if decorator is not None:
f = decorator(f)
setattr(TestMachineTranslation, f_name, f)
for _use_cuda_ in (False, True):
for _is_sparse_ in (False, True):
inject_test_train(_use_cuda_, _is_sparse_)
for _use_cuda_ in (False, True):
for _is_sparse_ in (False, True):
_decorator_ = None
if _use_cuda_:
_decorator_ = unittest.skip(
reason='Beam Search does not support CUDA!')
inject_test_decode(
is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
if __name__ == '__main__':
unittest.main()
......@@ -57,7 +57,7 @@ def train_program():
return [avg_cost, acc]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
......@@ -78,7 +78,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
event.epoch + 1, avg_cost, acc))
......@@ -100,11 +100,11 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -112,21 +112,21 @@ def infer(use_cuda, inference_program, save_dirname=None):
results = inferencer.infer({'img': tensor_img})
print("infer results: ", numpy.array(results[0]))
print("infer results: ", results[0])
def main(use_cuda):
save_dirname = "recognize_digits_conv.inference.model"
params_dirname = "recognize_digits_conv.inference.model"
# call train() with is_local argument to run distributed train
train(
use_cuda=use_cuda,
train_program=train_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
if __name__ == '__main__':
......
......@@ -44,7 +44,7 @@ def train_program():
return [avg_cost, acc]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
......@@ -62,7 +62,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
event.epoch + 1, avg_cost, acc))
......@@ -81,11 +81,11 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -93,21 +93,21 @@ def infer(use_cuda, inference_program, save_dirname=None):
results = inferencer.infer({'img': tensor_img})
print("infer results: ", numpy.array(results[0]))
print("infer results: ", results[0])
def main(use_cuda):
save_dirname = "recognize_digits_mlp.inference.model"
params_dirname = "recognize_digits_mlp.inference.model"
# call train() with is_local argument to run distributed train
train(
use_cuda=use_cuda,
train_program=train_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
if __name__ == '__main__':
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import sys
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.nets as nets
IS_SPARSE = True
USE_GPU = False
BATCH_SIZE = 256
def get_usr_combined_features():
# FIXME(dzh) : old API integer_value(10) may have range check.
# currently we don't have user configurated check.
USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
uid = layers.data(name='user_id', shape=[1], dtype='int64')
usr_emb = layers.embedding(
input=uid,
dtype='float32',
size=[USR_DICT_SIZE, 32],
param_attr='user_table',
is_sparse=IS_SPARSE)
usr_fc = layers.fc(input=usr_emb, size=32)
USR_GENDER_DICT_SIZE = 2
usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
usr_gender_emb = layers.embedding(
input=usr_gender_id,
size=[USR_GENDER_DICT_SIZE, 16],
param_attr='gender_table',
is_sparse=IS_SPARSE)
usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
usr_age_emb = layers.embedding(
input=usr_age_id,
size=[USR_AGE_DICT_SIZE, 16],
is_sparse=IS_SPARSE,
param_attr='age_table')
usr_age_fc = layers.fc(input=usr_age_emb, size=16)
USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
usr_job_emb = layers.embedding(
input=usr_job_id,
size=[USR_JOB_DICT_SIZE, 16],
param_attr='job_table',
is_sparse=IS_SPARSE)
usr_job_fc = layers.fc(input=usr_job_emb, size=16)
concat_embed = layers.concat(
input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
return usr_combined_features
def get_mov_combined_features():
MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
mov_emb = layers.embedding(
input=mov_id,
dtype='float32',
size=[MOV_DICT_SIZE, 32],
param_attr='movie_table',
is_sparse=IS_SPARSE)
mov_fc = layers.fc(input=mov_emb, size=32)
CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
category_id = layers.data(
name='category_id', shape=[1], dtype='int64', lod_level=1)
mov_categories_emb = layers.embedding(
input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
mov_categories_hidden = layers.sequence_pool(
input=mov_categories_emb, pool_type="sum")
MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
mov_title_id = layers.data(
name='movie_title', shape=[1], dtype='int64', lod_level=1)
mov_title_emb = layers.embedding(
input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
mov_title_conv = nets.sequence_conv_pool(
input=mov_title_emb,
num_filters=32,
filter_size=3,
act="tanh",
pool_type="sum")
concat_embed = layers.concat(
input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
# FIXME(dzh) : need tanh operator
mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
return mov_combined_features
def inference_program():
usr_combined_features = get_usr_combined_features()
mov_combined_features = get_mov_combined_features()
inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
scale_infer = layers.scale(x=inference, scale=5.0)
return scale_infer
def train_program():
scale_infer = inference_program()
label = layers.data(name='score', shape=[1], dtype='float32')
square_cost = layers.square_error_cost(input=scale_infer, label=label)
avg_cost = layers.mean(square_cost)
return [avg_cost, scale_infer]
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.SGD(learning_rate=0.2)
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer=optimizer)
feed_order = [
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
'movie_title', 'score'
]
def event_handler(event):
if isinstance(event, fluid.EndStepEvent):
test_reader = paddle.batch(
paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
avg_cost_set = trainer.test(
reader=test_reader, feed_order=feed_order)
# get avg cost
avg_cost = np.array(avg_cost_set).mean()
print("avg_cost: %s" % avg_cost)
if float(avg_cost) < 4: # Smaller value to increase CI speed
trainer.save_params(params_dirname)
trainer.stop()
else:
print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
float(avg_cost)))
if math.isnan(float(avg_cost)):
sys.exit("got NaN loss, training failed.")
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.movielens.train(), buf_size=8192),
batch_size=BATCH_SIZE)
trainer.train(
num_epochs=1,
event_handler=event_handler,
reader=train_reader,
feed_order=feed_order)
def infer(use_cuda, inference_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
inference_program, param_path=params_dirname, place=place)
# Use the first data from paddle.dataset.movielens.test() as input.
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor,
# where `data` is a list of sequences of index numbers, `lod` is
# the level of detail (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
# indicating that `data` consists of two sequences of length 3 and 2.
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
age_id = fluid.create_lod_tensor([[0]], [[1]], place)
job_id = fluid.create_lod_tensor([[10]], [[1]], place)
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
place)
results = inferencer.infer(
{
'user_id': user_id,
'gender_id': gender_id,
'age_id': age_id,
'job_id': job_id,
'movie_id': movie_id,
'category_id': category_id,
'movie_title': movie_title
},
return_numpy=False)
print("infer results: ", np.array(results[0]))
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
params_dirname = "recommender_system.inference.model"
train(
use_cuda=use_cuda,
train_program=train_program,
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
params_dirname=params_dirname)
if __name__ == '__main__':
main(USE_GPU)
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# This test is buggy
# py_test(test_understand_sentiment_dynamic_rnn SRCS
# test_understand_sentiment_dynamic_rnn.py SERIAL)
LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn)
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
......
......@@ -64,7 +64,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -85,7 +85,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -97,7 +97,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -112,26 +112,30 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
def create_random_lodtensor(lod, place, low, high):
data = np.random.random_integers(low, high,
[lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
lod = [0, 4, 10]
tensor_words = create_random_lodtensor(
lod, place, low=0, high=len(word_dict) - 1)
# Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists.
lod = [[3, 4, 2]]
base_shape = [1]
# The range of random integers is [low, high]
tensor_words = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=len(word_dict) - 1)
results = inferencer.infer({'words': tensor_words})
print("infer results: ", results)
......@@ -139,9 +143,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -79,7 +79,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -100,7 +100,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -112,7 +112,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -127,26 +127,30 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
def create_random_lodtensor(lod, place, low, high):
data = np.random.random_integers(low, high,
[lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
lod = [0, 4, 10]
tensor_words = create_random_lodtensor(
lod, place, low=0, high=len(word_dict) - 1)
# Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists.
lod = [[3, 4, 2]]
base_shape = [1]
# The range of random integers is [low, high]
tensor_words = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=len(word_dict) - 1)
results = inferencer.infer({'words': tensor_words})
print("infer results: ", results)
......@@ -154,9 +158,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -71,7 +71,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -92,7 +92,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -104,7 +104,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -119,26 +119,30 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
def create_random_lodtensor(lod, place, low, high):
data = np.random.random_integers(low, high,
[lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
lod = [0, 4, 10]
tensor_words = create_random_lodtensor(
lod, place, low=0, high=len(word_dict) - 1)
# Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists.
lod = [[3, 4, 2]]
base_shape = [1]
# The range of random integers is [low, high]
tensor_words = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=len(word_dict) - 1)
results = inferencer.infer({'words': tensor_words})
print("infer results: ", results)
......@@ -146,9 +150,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_stacked_lstm.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_stacked_lstm.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
......@@ -25,16 +25,6 @@ HIDDEN_SIZE = 256
N = 5
BATCH_SIZE = 32
def create_random_lodtensor(lod, place, low, high):
# The range of data elements is [low, high]
data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
......@@ -90,7 +80,7 @@ def train_program(is_sparse):
return avg_cost
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
test_reader = paddle.batch(
......@@ -107,7 +97,7 @@ def train(use_cuda, train_program, save_dirname):
print("loss= ", avg_cost)
if avg_cost < 10.0:
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
if math.isnan(avg_cost):
......@@ -125,16 +115,28 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
lod = [0, 1]
first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
second_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
third_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
infer_func=inference_program, param_path=params_dirname, place=place)
# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
# is simply an index to look up for the corresponding word vector and hence
# the shape of word (base_shape) should be [1]. The length-based level of
# detail (lod) info of each LoDtensor should be [[1]] meaning there is only
# one lod_level and there is only one sequence of one word on this level.
# Note that lod info should be a list of lists.
lod = [[1]]
base_shape = [1]
# The range of random integers is [low, high]
first_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1)
second_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1)
third_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1)
fourth_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1)
result = inferencer.infer(
{
......@@ -151,17 +153,17 @@ def main(use_cuda, is_sparse):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "word2vec.inference.model"
params_dirname = "word2vec.inference.model"
train(
use_cuda=use_cuda,
train_program=partial(train_program, is_sparse),
save_dirname=save_path)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=partial(inference_program, is_sparse),
save_dirname=save_path)
params_dirname=params_dirname)
if __name__ == '__main__':
......
......@@ -125,14 +125,6 @@ def stacked_lstm_net(data,
return avg_cost, accuracy, prediction
def create_random_lodtensor(lod, place, low, high):
data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
def train(word_dict,
net_method,
use_cuda,
......@@ -242,9 +234,21 @@ def infer(word_dict, use_cuda, save_dirname=None):
word_dict_len = len(word_dict)
lod = [0, 4, 10]
tensor_words = create_random_lodtensor(
lod, place, low=0, high=word_dict_len - 1)
# Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists.
lod = [[3, 4, 2]]
base_shape = [1]
# The range of random integers is [low, high]
tensor_words = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1)
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
......
......@@ -116,29 +116,6 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
return feature_out
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def create_random_lodtensor(lod, place, low, high):
data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
def train(use_cuda, save_dirname=None, is_local=True):
# define network topology
word = fluid.layers.data(
......@@ -271,23 +248,35 @@ def infer(use_cuda, save_dirname=None):
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
lod = [0, 4, 10]
word = create_random_lodtensor(
lod, place, low=0, high=word_dict_len - 1)
pred = create_random_lodtensor(
lod, place, low=0, high=pred_dict_len - 1)
ctx_n2 = create_random_lodtensor(
lod, place, low=0, high=word_dict_len - 1)
ctx_n1 = create_random_lodtensor(
lod, place, low=0, high=word_dict_len - 1)
ctx_0 = create_random_lodtensor(
lod, place, low=0, high=word_dict_len - 1)
ctx_p1 = create_random_lodtensor(
lod, place, low=0, high=word_dict_len - 1)
ctx_p2 = create_random_lodtensor(
lod, place, low=0, high=word_dict_len - 1)
mark = create_random_lodtensor(
lod, place, low=0, high=mark_dict_len - 1)
# Setup inputs by creating LoDTensors to represent sequences of words.
# Here each word is the basic element of these LoDTensors and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensors will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists.
lod = [[3, 4, 2]]
base_shape = [1]
# The range of random integers is [low, high]
word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1)
pred = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=pred_dict_len - 1)
ctx_n2 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1)
ctx_n1 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1)
ctx_0 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1)
ctx_p1 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1)
ctx_p2 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1)
mark = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=mark_dict_len - 1)
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
......
......@@ -173,62 +173,32 @@ def train(use_cuda, save_dirname, is_local=True):
test_reader = paddle.batch(
paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
feeding = {
'user_id': 0,
'gender_id': 1,
'age_id': 2,
'job_id': 3,
'movie_id': 4,
'category_id': 5,
'movie_title': 6,
'score': 7
}
def func_feed(feeding, data):
feed_tensors = {}
for (key, idx) in feeding.iteritems():
tensor = fluid.LoDTensor()
if key != "category_id" and key != "movie_title":
if key == "score":
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"float32")
else:
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"int64")
else:
numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
data)
lod_info = [len(item) for item in numpy_data]
offset = 0
lod = [offset]
for item in lod_info:
offset += item
lod.append(offset)
numpy_data = np.concatenate(numpy_data, axis=0)
tensor.set_lod([lod])
numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
tensor.set(numpy_data, place)
feed_tensors[key] = tensor
return feed_tensors
feed_order = [
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
'movie_title', 'score'
]
def train_loop(main_program):
exe.run(framework.default_startup_program())
feed_list = [
main_program.global_block().var(var_name) for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for batch_id, data in enumerate(train_reader()):
# train a mini-batch
outs = exe.run(program=main_program,
feed=func_feed(feeding, data),
feed=feeder.feed(data),
fetch_list=[avg_cost])
out = np.array(outs[0])
if (batch_id + 1) % 10 == 0:
avg_cost_set = []
for test_data in test_reader():
avg_cost_np = exe.run(
program=test_program,
feed=func_feed(feeding, test_data),
avg_cost_np = exe.run(program=test_program,
feed=feeder.feed(test_data),
fetch_list=[avg_cost])
avg_cost_set.append(avg_cost_np[0])
break # test only 1 segment for speeding up CI
......@@ -279,23 +249,6 @@ def infer(use_cuda, save_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
def create_lod_tensor(data, lod=None):
tensor = fluid.LoDTensor()
if lod is None:
# Tensor, the shape is [batch_size, 1]
index = 0
lod_0 = [index]
for l in range(len(data)):
index += 1
lod_0.append(index)
lod = [lod_0]
tensor.set_lod(lod)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
tensor.set(flattened_data, place)
return tensor
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
# Use fluid.io.load_inference_model to obtain the inference program desc,
......@@ -307,26 +260,33 @@ def infer(use_cuda, save_dirname=None):
# Use the first data from paddle.dataset.movielens.test() as input
assert feed_target_names[0] == "user_id"
user_id = create_lod_tensor([[1]])
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor
# where `data` is a list of sequences of index numbers, `lod` is
# the level of detail (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
# indicating that `data` consists of two sequences of length 3 and 2.
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
assert feed_target_names[1] == "gender_id"
gender_id = create_lod_tensor([[1]])
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
assert feed_target_names[2] == "age_id"
age_id = create_lod_tensor([[0]])
age_id = fluid.create_lod_tensor([[0]], [[1]], place)
assert feed_target_names[3] == "job_id"
job_id = create_lod_tensor([[10]])
job_id = fluid.create_lod_tensor([[10]], [[1]], place)
assert feed_target_names[4] == "movie_id"
movie_id = create_lod_tensor([[783]])
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
assert feed_target_names[5] == "category_id"
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
assert feed_target_names[6] == "movie_title"
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
[[0, 5]])
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]],
[[5]], place)
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
......
......@@ -21,15 +21,6 @@ import math
import sys
def create_random_lodtensor(lod, place, low, high):
# The range of data elements is [low, high]
data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
PASS_NUM = 100
EMBED_SIZE = 32
......@@ -175,16 +166,23 @@ def infer(use_cuda, save_dirname=None):
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
# Setup inputs, by creating 4 words, the lod of which should be [0, 1]
lod = [0, 1]
first_word = create_random_lodtensor(
lod, place, low=0, high=dict_size - 1)
second_word = create_random_lodtensor(
lod, place, low=0, high=dict_size - 1)
third_word = create_random_lodtensor(
lod, place, low=0, high=dict_size - 1)
fourth_word = create_random_lodtensor(
lod, place, low=0, high=dict_size - 1)
# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
# is simply an index to look up for the corresponding word vector and hence
# the shape of word (base_shape) should be [1]. The length-based level of
# detail (lod) info of each LoDtensor should be [[1]] meaning there is only
# one lod_level and there is only one sequence of one word on this level.
# Note that lod info should be a list of lists.
lod = [[1]]
base_shape = [1]
# The range of random integers is [low, high]
first_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1)
second_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1)
third_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1)
fourth_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1)
assert feed_target_names[0] == 'firstw'
assert feed_target_names[1] == 'secondw'
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor, _validate_lod, _convert_lod
import numpy
import unittest
class TestLoDTensor(unittest.TestCase):
def test_validate_lod(self):
lod = (1, 2, 1)
self.assertRaises(AssertionError, _validate_lod, lod, -1)
lod = [[1, 2], (2, 3)]
self.assertRaises(AssertionError, _validate_lod, lod, -1)
lod = [1, 2, 3]
self.assertRaises(AssertionError, _validate_lod, lod, -1)
lod = []
self.assertTrue(_validate_lod(lod, -1))
lod = [[], [1], [3]]
self.assertFalse(_validate_lod(lod, -1))
lod = [[0], [-1], [3]]
self.assertFalse(_validate_lod(lod, -1))
# Each level's sum should be equal to the number of items in the next level
# Moreover, last level's sum should be equal to the tensor height
lod = [[2, 3], [1, 3, 1, 2, 1]]
self.assertTrue(_validate_lod(lod, tensor_height=8))
lod = [[1, 3], [2, 1, 3]]
self.assertFalse(_validate_lod(lod, tensor_height=6))
lod = [[1, 3], [2, 1, 3, 4]]
self.assertFalse(_validate_lod(lod, tensor_height=5))
def test_convert_lod(self):
lod = [[1, 2, 3]]
converted_lod = [[0, 1, 3, 6]]
self.assertEqual(_convert_lod(lod), converted_lod)
lod = [[2, 3], [1, 3, 1, 2, 1]]
converted_lod = [[0, 2, 5], [0, 1, 4, 5, 7, 8]]
self.assertEqual(_convert_lod(lod), converted_lod)
def test_create_lod_tensor(self):
# Create LoDTensor from a list
data = [[1, 2, 3], [3, 4]]
wrong_lod = [[2, 2]]
correct_lod = [[3, 2]]
self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod,
fluid.CPUPlace())
tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace())
self.assertEqual(tensor.lod(), [[0, 3, 5]])
# Create LoDTensor from numpy array
data = numpy.random.random([10, 1])
lod = [[2, 1], [3, 3, 4]]
tensor = create_lod_tensor(data, lod, fluid.CPUPlace())
self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]])
# Create LoDTensor from another LoDTensor, they are differnt instances
new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]]
new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace())
self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]])
self.assertEqual(new_tensor.lod(), [[0, 2, 4, 5], [0, 1, 3, 5, 8, 10]])
def test_create_random_int_lodtensor(self):
# The shape of a word, commonly used in speech and NLP problem, is [1]
shape = [1]
lod = [[2, 3, 5]]
dict_size = 10000
low = 0
high = dict_size - 1
tensor = create_random_int_lodtensor(lod, shape,
fluid.CPUPlace(), low, high)
self.assertEqual(tensor.lod(), [[0, 2, 5, 10]])
self.assertEqual(tensor.shape(), [10, 1])
if __name__ == '__main__':
unittest.main()
......@@ -17,7 +17,7 @@ endif(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
list(REMOVE_ITEM TEST_OPS test_nce) # IXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
......@@ -39,74 +39,12 @@ function(py_test_modules TARGET_NAME)
endif()
endif()
endfunction()
list(REMOVE_ITEM TEST_OPS test_sequence_expand)
# test time consuming OPs in a separate process for expliot parallism
list(REMOVE_ITEM TEST_OPS test_parallel_executor)
list(REMOVE_ITEM TEST_OPS test_warpctc_op)
list(REMOVE_ITEM TEST_OPS test_dyn_rnn)
list(REMOVE_ITEM TEST_OPS test_mul_op)
# tests that need to be run in separate process.
list(REMOVE_ITEM TEST_OPS test_multihead_attention)
list(REMOVE_ITEM TEST_OPS test_calc_gradient)
list(REMOVE_ITEM TEST_OPS test_while_op)
list(REMOVE_ITEM TEST_OPS test_lod_array_length_op)
list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor)
list(REMOVE_ITEM TEST_OPS test_profiler)
list(REMOVE_ITEM TEST_OPS test_nvprof)
list(REMOVE_ITEM TEST_OPS test_normalization_wrapper)
list(REMOVE_ITEM TEST_OPS test_executor_and_mul)
list(REMOVE_ITEM TEST_OPS test_assign_value_op)
list(REMOVE_ITEM TEST_OPS test_array_read_write_op)
list(REMOVE_ITEM TEST_OPS test_lod_rank_table)
list(REMOVE_ITEM TEST_OPS test_weight_normalization)
list(REMOVE_ITEM TEST_OPS test_conditional_block)
list(REMOVE_ITEM TEST_OPS test_parameter)
list(REMOVE_ITEM TEST_OPS test_registry)
list(REMOVE_ITEM TEST_OPS test_fetch_var)
list(REMOVE_ITEM TEST_OPS test_parallel_op)
list(REMOVE_ITEM TEST_OPS test_dynrnn_static_input)
list(REMOVE_ITEM TEST_OPS test_dist_train)
list(REMOVE_ITEM TEST_OPS test_network_with_dtype)
# tests that can be bundled together in one python process for speed.
if(WITH_FAST_BUNDLE_TEST)
py_test_modules("test_all_ops" MODULES ${TEST_OPS})
else()
foreach(TEST_OP ${TEST_OPS})
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
endif(WITH_FAST_BUNDLE_TEST)
#
py_test_modules(test_sequence_expand MODULES test_sequence_expand)
# tests with high overhead
py_test_modules(test_parallel_executor MODULES test_parallel_executor)
endforeach(TEST_OP)
py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn)
py_test_modules(test_mul_op MODULES test_mul_op)
py_test_modules(test_network_with_dtype MODULES test_network_with_dtype)
# tests that need to be run in separate process.
py_test_modules(test_multihead_attention MODULES test_multihead_attention)
py_test_modules(test_calc_gradient MODULES test_calc_gradient)
py_test_modules(test_while_op MODULES test_while_op)
py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op)
py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor)
py_test_modules(test_profiler MODULES test_profiler)
py_test_modules(test_nvprof MODULES test_nvprof)
py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper)
py_test_modules(test_executor_and_mul MODULES test_executor_and_mul)
py_test_modules(test_assign_value_op MODULES test_assign_value_op)
py_test_modules(test_array_read_write_op MODULES test_array_read_write_op)
py_test_modules(test_lod_rank_table MODULES test_lod_rank_table)
py_test_modules(test_weight_normalization MODULES test_weight_normalization)
py_test_modules(test_conditional_block MODULES test_conditional_block)
py_test_modules(test_parameter MODULES test_parameter)
py_test_modules(test_registry MODULES test_registry)
py_test_modules(test_fetch_var MODULES test_fetch_var)
py_test_modules(test_dynrnn_static_input MODULES test_dynrnn_static_input)
py_test_modules(test_parallel_op MODULES test_parallel_op)
py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
......@@ -36,6 +36,12 @@ def randomize_probability(batch_size, class_num, dtype='float32'):
def create_op(scope, op_type, inputs, outputs, attrs):
kwargs = dict()
op_maker = core.op_proto_and_checker_maker
op_role_attr_name = op_maker.kOpRoleAttrName()
if op_role_attr_name not in attrs:
attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
def __create_var__(name, var_name):
scope.var(var_name).get_tensor()
kwargs[name].append(var_name)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import time
import numpy as np
__all__ = ['TestParallelExecutorBase']
class TestParallelExecutorBase(unittest.TestCase):
def check_network_convergence(self,
method,
memory_opt=True,
iter=50,
batch_size=None,
allow_op_delay=False,
feed_dict=None,
seed=None,
use_parallel_executor=True,
balance_parameter_opt_between_cards=False):
def run_executor(exe, feed, fetch_list, program=None):
if isinstance(exe, fluid.ParallelExecutor):
res = exe.run(fetch_list=fetch_list, feed=feed)
elif isinstance(exe, fluid.Executor):
if program is None:
program = fluid.default_main_program()
res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
else:
raise ValueError('Unkown type exe')
return res
main = fluid.Program()
startup = fluid.Program()
startup.random_seed = 1 # Fix random seed
with fluid.program_guard(main, startup):
if seed is not None:
startup.random_seed = seed
loss = method(use_feed=feed_dict is not None)
adam = fluid.optimizer.Adam()
adam.minimize(loss)
if memory_opt:
fluid.memory_optimize(main)
place = fluid.CUDAPlace(0)
startup_exe = fluid.Executor(place)
startup_exe.run(startup)
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.allow_op_delay = allow_op_delay
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce
if use_parallel_executor:
exe = fluid.ParallelExecutor(
True,
loss_name=loss.name,
exec_strategy=exec_strategy,
build_strategy=build_strategy)
else:
exe = fluid.Executor(place=place)
if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count()
begin = time.time()
first_loss, = run_executor(
exe=exe, feed=feed_dict, fetch_list=[loss.name])
first_loss = np.array(first_loss)
for i in xrange(iter):
run_executor(exe=exe, feed=feed_dict, fetch_list=[])
last_loss, = run_executor(
exe=exe, feed=feed_dict, fetch_list=[loss.name])
end = time.time()
if batch_size is not None:
print "%.4f Instance per second" % (
(batch_size * iter + 2) / (end - begin))
last_loss = np.array(last_loss)
print first_loss, last_loss
# self.assertGreater(first_loss[0], last_loss[0])
return first_loss, last_loss
......@@ -12,19 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import unittest
from multiprocessing import Process
import numpy
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers
import numpy
from multiprocessing import Process
from threading import Thread
import os, sys
import time
class TestSendOp(unittest.TestCase):
@unittest.skip(
"This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest."
)
def test_send(self):
# Run init_serv in a thread
place = fluid.CPUPlace()
......
......@@ -50,5 +50,27 @@ class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
self.check_output()
class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest):
def setUp(self):
self.op_type = "fill_constant_batch_size_like"
self.inputs = {
'Input': (np.random.random((31, 28)).astype("float32"),
[[0, 9, 23, 31]])
}
self.attrs = {
'value': 3.5,
'shape': [-1, 16],
'input_dim_idx': 0,
'output_dim_idx': 0
}
out = np.random.random((3, 16)).astype("float32")
out.fill(3.5)
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
......@@ -369,6 +369,16 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output)
print(str(program))
def test_upsampling_bilinear2d(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
output = layers.upsampling_bilinear2d(x, out_shape=[12, 12])
self.assertIsNotNone(output)
output = layers.upsampling_bilinear2d(x, scale=3)
self.assertIsNotNone(output)
print(str(program))
if __name__ == '__main__':
unittest.main()
......@@ -63,7 +63,10 @@ class TestOperator(unittest.TestCase):
self.assertEqual(mul_op.output("Out"), ["mul.out"])
self.assertEqual(
set(mul_op.attr_names),
set(["x_num_col_dims", "y_num_col_dims", "use_mkldnn"]))
set([
"x_num_col_dims", "y_num_col_dims", "use_mkldnn", "op_role",
"op_role_var"
]))
self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import unittest
import paddle.fluid as fluid
import paddle
import paddle.dataset.mnist as mnist
import paddle.dataset.wmt16 as wmt16
def simple_fc_net(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=['./mnist.recordio'],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
thread_num=1,
for_parallel=True)
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in xrange(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def fc_with_batchnorm(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=['mnist.recordio'],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
thread_num=1,
for_parallel=True)
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in xrange(1):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
hidden = fluid.layers.batch_norm(input=hidden)
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def squeeze_excitation(input, num_channels, reduction_ratio):
# pool = fluid.layers.pool2d(
# input=input, pool_size=0, pool_type='avg', global_pooling=True)
conv = input
shape = conv.shape
reshape = fluid.layers.reshape(
x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
pool = fluid.layers.reduce_mean(input=reshape, dim=2)
squeeze = fluid.layers.fc(input=pool,
size=num_channels / reduction_ratio,
act='relu')
excitation = fluid.layers.fc(input=squeeze,
size=num_channels,
act='sigmoid')
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) / 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
def shortcut(input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out:
if stride == 1:
filter_size = 1
else:
filter_size = 3
return conv_bn_layer(input, ch_out, filter_size, stride)
else:
return input
def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
# The number of first 1x1 convolutional channels for each bottleneck build block
# was halved to reduce the compution cost.
conv0 = conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = conv_bn_layer(
input=conv0,
num_filters=num_filters * 2,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu')
conv2 = conv_bn_layer(
input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
scale = squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
short = shortcut(input, num_filters * 2, stride)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def SE_ResNeXt50Small(batch_size=2, use_feed=False):
assert not use_feed, "SE_ResNeXt doesn't support feed yet"
img = fluid.layers.fill_constant(
shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
label = fluid.layers.fill_constant(
shape=[batch_size, 1], dtype='int64', value=0.0)
conv = conv_bn_layer(
input=img, num_filters=16, filter_size=3, stride=2, act='relu')
conv = conv_bn_layer(
input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
conv = conv_bn_layer(
input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
for block in range(len(depth)):
for i in range(depth[block]):
conv = bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio)
shape = conv.shape
reshape = fluid.layers.reshape(
x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
pool = fluid.layers.reduce_mean(input=reshape, dim=2)
dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
# Classifier layer:
prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
import time
class TestParallelExecutorBase(unittest.TestCase):
def check_network_convergence(self,
method,
memory_opt=True,
iter=50,
batch_size=None,
allow_op_delay=False,
feed_dict=None,
seed=None,
use_parallel_executor=True,
balance_parameter_opt_between_cards=False):
def run_executor(exe, feed, fetch_list, program=None):
if isinstance(exe, fluid.ParallelExecutor):
res = exe.run(fetch_list=fetch_list, feed=feed)
elif isinstance(exe, fluid.Executor):
if program is None:
program = fluid.default_main_program()
res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
else:
raise ValueError('Unkown type exe')
return res
main = fluid.Program()
startup = fluid.Program()
startup.random_seed = 1 # Fix random seed
with fluid.program_guard(main, startup):
if seed is not None:
startup.random_seed = seed
loss = method(use_feed=feed_dict is not None)
adam = fluid.optimizer.Adam()
adam.minimize(loss)
if memory_opt:
fluid.memory_optimize(main)
place = fluid.CUDAPlace(0)
startup_exe = fluid.Executor(place)
startup_exe.run(startup)
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.allow_op_delay = allow_op_delay
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce
if use_parallel_executor:
exe = fluid.ParallelExecutor(
True,
loss_name=loss.name,
exec_strategy=exec_strategy,
build_strategy=build_strategy)
else:
exe = fluid.Executor(place=place)
if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count()
begin = time.time()
first_loss, = run_executor(
exe=exe, feed=feed_dict, fetch_list=[loss.name])
first_loss = np.array(first_loss)
for i in xrange(iter):
run_executor(exe=exe, feed=feed_dict, fetch_list=[])
last_loss, = run_executor(
exe=exe, feed=feed_dict, fetch_list=[loss.name])
end = time.time()
if batch_size is not None:
print "%.4f Instance per second" % (
(batch_size * iter + 2) / (end - begin))
last_loss = np.array(last_loss)
print first_loss, last_loss
# self.assertGreater(first_loss[0], last_loss[0])
return first_loss, last_loss
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=4)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_file(
'./mnist.recordio', reader, feeder)
def check_simple_fc_convergence(self, balance_parameter_opt_between_cards):
self.check_network_convergence(simple_fc_net)
self.check_network_convergence(simple_fc_net, allow_op_delay=True)
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
self.check_network_convergence(
simple_fc_net,
feed_dict={"image": img,
"label": label},
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
def test_simple_fc(self):
self.check_simple_fc_convergence(False)
def test_simple_fc_with_new_strategy(self):
self.check_simple_fc_convergence(True)
def check_simple_fc_parallel_accuracy(self,
balance_parameter_opt_between_cards):
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
single_first_loss, single_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1000,
feed_dict={"image": img,
"label": label},
use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1000,
feed_dict={"image": img,
"label": label},
use_parallel_executor=True,
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
for p_f in parallel_first_loss:
self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
for p_l in parallel_last_loss:
self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
def test_simple_fc_parallel_accuracy(self):
self.check_simple_fc_parallel_accuracy(False)
def test_simple_fc_parallel_accuracy_with_new_strategy(self):
self.check_simple_fc_parallel_accuracy(True)
def check_batchnorm_fc_convergence(self,
balance_parameter_opt_between_cards):
self.check_network_convergence(fc_with_batchnorm)
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
self.check_network_convergence(
fc_with_batchnorm,
feed_dict={"image": img,
"label": label},
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
def test_batchnorm_fc(self):
self.check_batchnorm_fc_convergence(False)
def test_batchnorm_fc_with_new_strategy(self):
self.check_batchnorm_fc_convergence(True)
class TestResnet(TestParallelExecutorBase):
# @classmethod
# def setUpClass(cls):
# # import os
# # if os.path.exists('./flowers.recordio'):
# # return
# with fluid.program_guard(fluid.Program(), fluid.Program()):
# reader = paddle.batch(flowers.train(), batch_size=4)
# feeder = fluid.DataFeeder(
# feed_list=[
# fluid.layers.data(
# name='image', shape=[3, 224, 224]),
# fluid.layers.data(
# name='label', shape=[1], dtype='int64'),
# ],
# place=fluid.CPUPlace())
# fluid.recordio_writer.convert_reader_to_recordio_file(
# "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress)
def check_resnet_convergence(self, balance_parameter_opt_between_cards):
import functools
batch_size = 2
self.check_network_convergence(
functools.partial(
SE_ResNeXt50Small, batch_size=batch_size),
iter=20,
batch_size=batch_size,
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
def test_resnet(self):
self.check_resnet_convergence(False)
def test_resnet_with_new_strategy(self):
self.check_resnet_convergence(True)
class ModelHyperParams(object):
# Dictionary size for source and target language. This model directly uses
# paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
# alreay been added, but the <pad> token is not added. Transformer requires
# sequences in a mini-batch are padded to have the same length. A <pad> token is
# added into the original dictionary in paddle.dateset.wmt16.
# size of source word dictionary.
src_vocab_size = 10000
# index for <pad> token in source language.
src_pad_idx = src_vocab_size
# size of target word dictionay
trg_vocab_size = 10000
# index for <pad> token in target language.
trg_pad_idx = trg_vocab_size
# position value corresponding to the <pad> token.
pos_pad_idx = 0
# max length of sequences. It should plus 1 to include position
# padding token for position encoding.
max_length = 50
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model = 512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 1024
# the dimension that keys are projected to for dot-product attention.
d_key = 64
# the dimension that values are projected to for dot-product attention.
d_value = 64
# number of head used in multi-head attention.
n_head = 8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6
# dropout rate used by all dropout layers.
dropout = 0.1
def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias. Then, convert the numpy
data to tensors and return a dict mapping names to tensors.
"""
def __pad_batch_data(insts,
pad_idx,
is_target=False,
return_pos=True,
return_attn_bias=True,
return_max_len=True):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
inst_data = np.array(
[inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, 1])]
if return_pos:
inst_pos = np.array([[
pos_i + 1 if w_i != pad_idx else 0
for pos_i, w_i in enumerate(inst)
] for inst in inst_data])
return_list += [inst_pos.astype("int64").reshape([-1, 1])]
if return_attn_bias:
if is_target:
# This is used to avoid attention on paddings and subsequent
# words.
slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
max_len))
slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
[-1, 1, max_len, max_len])
slf_attn_bias_data = np.tile(slf_attn_bias_data,
[1, n_head, 1, 1]) * [-1e9]
else:
# This is used to avoid attention on paddings.
slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
(max_len - len(inst))
for inst in insts])
slf_attn_bias_data = np.tile(
slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
[1, n_head, max_len, 1])
return_list += [slf_attn_bias_data.astype("float32")]
if return_max_len:
return_list += [max_len]
return return_list if len(return_list) > 1 else return_list[0]
def data_to_tensor(data_list, name_list, input_dict, place):
assert len(data_list) == len(name_list)
for i in range(len(name_list)):
tensor = fluid.LoDTensor()
tensor.set(data_list[i], place)
input_dict[name_list[i]] = tensor
src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
[inst[0] for inst in insts], src_pad_idx, is_target=False)
trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
[inst[1] for inst in insts], trg_pad_idx, is_target=True)
trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
[1, 1, trg_max_len, 1]).astype("float32")
lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
False, False, False)
lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
return [
src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
]
import transformer_model
def transformer(use_feed):
assert not use_feed, "transfomer doesn't support feed yet"
return transformer_model.transformer(
ModelHyperParams.src_vocab_size + 1,
ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer, ModelHyperParams.n_head,
ModelHyperParams.d_key, ModelHyperParams.d_value,
ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
class TestTransformer(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
reader = paddle.batch(
wmt16.train(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
batch_size=transformer_model.batch_size)
with fluid.recordio_writer.create_recordio_writer(
"./wmt16.recordio") as writer:
for batch in reader():
for tensor in prepare_batch_input(
batch, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
t = fluid.LoDTensor()
t.set(tensor, fluid.CPUPlace())
writer.append_tensor(t)
writer.complete_append_tensor()
@unittest.skip("transformer is buggy in multi gpu")
def test_main(self):
self.check_network_convergence(transformer)
class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def check_network_convergence(self, build_strategy=None):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = simple_fc_net(True)
test_program = main.clone(for_test=True)
opt = fluid.optimizer.SGD(learning_rate=0.001)
opt.minimize(loss)
batch_size = 32
image = np.random.normal(size=(batch_size, 784)).astype('float32')
label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
feed_dict = {'image': image, 'label': label}
train_exe = fluid.ParallelExecutor(
use_cuda=True,
loss_name=loss.name,
main_program=main,
build_strategy=build_strategy)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe,
build_strategy=build_strategy)
for i in xrange(5):
test_loss, = test_exe.run([loss.name], feed=feed_dict)
test_loss = np.array(test_loss)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
train_loss = np.array(train_loss)
self.assertTrue(
np.allclose(
train_loss, test_loss, atol=1e-8),
"Train loss: " + str(train_loss) + "\n Test loss:" +
str(test_loss))
def test_parallel_testing(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(build_strategy)
def test_parallel_testing_with_new_strategy(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(build_strategy)
import paddle.dataset.conll05 as conll05
import paddle.fluid as fluid
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_dict_len = len(verb_dict)
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 1e-3
embedding_name = 'emb'
def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
is_sparse, **ignored):
# 8 features
predicate_embedding = fluid.layers.embedding(
input=predicate,
is_sparse=is_sparse,
size=[pred_dict_len, word_dim],
dtype='float32',
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
input=mark,
is_sparse=is_sparse,
size=[mark_dict_len, mark_dim],
dtype='float32')
word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [
fluid.layers.embedding(
size=[word_dict_len, word_dim],
is_sparse=is_sparse,
input=x,
param_attr=fluid.ParamAttr(
name=embedding_name, trainable=False)) for x in word_input
]
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
hidden_0_layers = [
fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
for emb in emb_layers
]
hidden_0 = fluid.layers.sums(input=hidden_0_layers)
lstm_0 = fluid.layers.dynamic_lstm(
input=hidden_0,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid')
# stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
for i in range(1, depth):
mix_hidden = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
])
lstm = fluid.layers.dynamic_lstm(
input=mix_hidden,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid',
is_reverse=((i % 2) == 1))
input_tmp = [mix_hidden, lstm]
feature_out = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
])
return feature_out
class TestCRFModel(unittest.TestCase):
def check_network_convergence(self, is_sparse, build_strategy=None):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
feature_out = db_lstm(**locals())
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
crf_cost = fluid.layers.linear_chain_crf(
input=feature_out,
label=target,
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=1e-1))
avg_cost = fluid.layers.mean(crf_cost)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.01,
decay_steps=100000,
decay_rate=0.5,
staircase=True))
sgd_optimizer.minimize(avg_cost)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192),
batch_size=16)
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
pe = fluid.ParallelExecutor(
use_cuda=True,
loss_name=avg_cost.name,
build_strategy=build_strategy)
feeder = fluid.DataFeeder(
feed_list=[
word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
mark, target
],
place=fluid.CPUPlace())
data = train_data()
for i in xrange(10):
cur_batch = next(data)
print map(np.array,
pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name]))[0]
def test_update_sparse_parameter_all_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy)
def test_update_dense_parameter_all_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy)
def test_update_sparse_parameter_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy)
def test_update_dense_parameter_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy)
# test fetch all the variables of global_block
import paddle.dataset.flowers as flowers
import math
def Lenet(data, class_dim):
conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None)
bn1 = fluid.layers.batch_norm(conv1, act='relu')
pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2)
conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None)
bn2 = fluid.layers.batch_norm(conv2, act='relu')
pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2)
fc1 = fluid.layers.fc(pool2, size=500, act='relu')
fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax')
return fc2
class TestFetchOp(unittest.TestCase):
def parallel_exe(self, train_inputs, seed):
main = fluid.Program()
startup = fluid.Program()
startup.random_seed = seed
with fluid.program_guard(main, startup):
data = fluid.layers.data(
name='image', shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = Lenet(data, class_dim=102)
loss = fluid.layers.cross_entropy(input=out, label=label)
loss = fluid.layers.mean(loss)
opt = fluid.optimizer.Momentum(
learning_rate=0.1,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
opt.minimize(loss)
# TODO(zcd): I found that onece the memory optimizer is open,
# parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD,
# conv2d_1.b_0@GRAD. Those variables should not be pruned.
# fluid.memory_optimize(main)
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
pe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name, main_program=main)
fetch_list = []
all_vars = main.global_block().vars
for k, v in all_vars.iteritems():
if 'tmp' not in k and k[0] is not '_' or v.persistable:
fetch_list.append(k)
for data in train_inputs:
ret = pe.run(fetch_list, feed=feeder.feed(data))
for i in range(len(fetch_list)):
assert not math.isnan(np.sum(ret[i])) and \
not math.isinf(np.sum(ret[i]))
def test_fetch_op(self):
tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
tst_reader_iter = tst_reader()
iters = 3
train_inputs = []
for i in range(iters):
train_inputs.append(tst_reader_iter.next())
self.parallel_exe(train_inputs, seed=1)
class TestFeedParallel(unittest.TestCase):
def test_main(self):
main = fluid.Program()
startup = fluid.Program()
startup.random_seed = 1
with fluid.scope_guard(fluid.core.Scope()):
with fluid.program_guard(main, startup):
data = fluid.layers.data(
name='image', shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
out = Lenet(data, class_dim=102)
loss = fluid.layers.cross_entropy(input=out, label=label)
loss = fluid.layers.mean(loss)
opt = fluid.optimizer.Momentum(
learning_rate=0.1,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
opt.minimize(loss)
place = fluid.CUDAPlace(0)
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
reader = feeder.decorate_reader(
paddle.batch(
flowers.train(), batch_size=16), multi_devices=True)
exe = fluid.Executor(place)
exe.run(startup)
pe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name, main_program=main)
for batch_id, data in enumerate(reader()):
loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0])
print batch_id, loss_np
if batch_id == 2:
break
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.dataset.conll05 as conll05
import paddle.fluid as fluid
import unittest
import paddle
import numpy as np
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_dict_len = len(verb_dict)
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 1e-3
embedding_name = 'emb'
def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
is_sparse, **ignored):
# 8 features
predicate_embedding = fluid.layers.embedding(
input=predicate,
is_sparse=is_sparse,
size=[pred_dict_len, word_dim],
dtype='float32',
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
input=mark,
is_sparse=is_sparse,
size=[mark_dict_len, mark_dim],
dtype='float32')
word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [
fluid.layers.embedding(
size=[word_dict_len, word_dim],
is_sparse=is_sparse,
input=x,
param_attr=fluid.ParamAttr(
name=embedding_name, trainable=False)) for x in word_input
]
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
hidden_0_layers = [
fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
for emb in emb_layers
]
hidden_0 = fluid.layers.sums(input=hidden_0_layers)
lstm_0 = fluid.layers.dynamic_lstm(
input=hidden_0,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid')
# stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
for i in range(1, depth):
mix_hidden = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
])
lstm = fluid.layers.dynamic_lstm(
input=mix_hidden,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid',
is_reverse=((i % 2) == 1))
input_tmp = [mix_hidden, lstm]
feature_out = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
])
return feature_out
class TestCRFModel(unittest.TestCase):
def check_network_convergence(self, is_sparse, build_strategy=None):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
feature_out = db_lstm(**locals())
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
crf_cost = fluid.layers.linear_chain_crf(
input=feature_out,
label=target,
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=1e-1))
avg_cost = fluid.layers.mean(crf_cost)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.01,
decay_steps=100000,
decay_rate=0.5,
staircase=True))
sgd_optimizer.minimize(avg_cost)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192),
batch_size=16)
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
pe = fluid.ParallelExecutor(
use_cuda=True,
loss_name=avg_cost.name,
build_strategy=build_strategy)
feeder = fluid.DataFeeder(
feed_list=[
word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
mark, target
],
place=fluid.CPUPlace())
data = train_data()
for i in xrange(10):
cur_batch = next(data)
print map(np.array,
pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name]))[0]
def test_update_sparse_parameter_all_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy)
def test_update_dense_parameter_all_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy)
def test_update_sparse_parameter_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy)
def test_update_dense_parameter_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.dataset.flowers as flowers
import math
import paddle.fluid as fluid
import unittest
import numpy as np
import paddle
def Lenet(data, class_dim):
conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None)
bn1 = fluid.layers.batch_norm(conv1, act='relu')
pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2)
conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None)
bn2 = fluid.layers.batch_norm(conv2, act='relu')
pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2)
fc1 = fluid.layers.fc(pool2, size=500, act='relu')
fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax')
return fc2
class TestFetchOp(unittest.TestCase):
def parallel_exe(self, train_inputs, seed):
main = fluid.Program()
startup = fluid.Program()
startup.random_seed = seed
with fluid.program_guard(main, startup):
data = fluid.layers.data(
name='image', shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = Lenet(data, class_dim=102)
loss = fluid.layers.cross_entropy(input=out, label=label)
loss = fluid.layers.mean(loss)
opt = fluid.optimizer.Momentum(
learning_rate=0.1,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
opt.minimize(loss)
# TODO(zcd): I found that onece the memory optimizer is open,
# parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD,
# conv2d_1.b_0@GRAD. Those variables should not be pruned.
# fluid.memory_optimize(main)
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
pe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name, main_program=main)
fetch_list = []
all_vars = main.global_block().vars
for k, v in all_vars.iteritems():
if 'tmp' not in k and k[0] is not '_' or v.persistable:
fetch_list.append(k)
for data in train_inputs:
ret = pe.run(fetch_list, feed=feeder.feed(data))
for i in range(len(fetch_list)):
assert not math.isnan(np.sum(ret[i])) and \
not math.isinf(np.sum(ret[i]))
def test_fetch_op(self):
tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
tst_reader_iter = tst_reader()
iters = 3
train_inputs = []
for i in range(iters):
train_inputs.append(tst_reader_iter.next())
self.parallel_exe(train_inputs, seed=1)
class TestFeedParallel(unittest.TestCase):
def test_main(self):
main = fluid.Program()
startup = fluid.Program()
startup.random_seed = 1
with fluid.scope_guard(fluid.core.Scope()):
with fluid.program_guard(main, startup):
data = fluid.layers.data(
name='image', shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
out = Lenet(data, class_dim=102)
loss = fluid.layers.cross_entropy(input=out, label=label)
loss = fluid.layers.mean(loss)
opt = fluid.optimizer.Momentum(
learning_rate=0.1,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
opt.minimize(loss)
place = fluid.CUDAPlace(0)
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
reader = feeder.decorate_reader(
paddle.batch(
flowers.train(), batch_size=16), multi_devices=True)
exe = fluid.Executor(place)
exe.run(startup)
pe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name, main_program=main)
for batch_id, data in enumerate(reader()):
loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0])
print batch_id, loss_np
if batch_id == 2:
break
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import numpy as np
import paddle
import paddle.dataset.mnist as mnist
import unittest
MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
def simple_fc_net(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
thread_num=1,
for_parallel=True)
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in xrange(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def fc_with_batchnorm(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
thread_num=1,
for_parallel=True)
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in xrange(1):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
hidden = fluid.layers.batch_norm(input=hidden)
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=4)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_file(
MNIST_RECORDIO_FILE, reader, feeder)
def check_simple_fc_convergence(self, balance_parameter_opt_between_cards):
self.check_network_convergence(simple_fc_net)
self.check_network_convergence(simple_fc_net, allow_op_delay=True)
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
self.check_network_convergence(
simple_fc_net,
feed_dict={"image": img,
"label": label},
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
def test_simple_fc(self):
self.check_simple_fc_convergence(False)
def test_simple_fc_with_new_strategy(self):
self.check_simple_fc_convergence(True)
def check_simple_fc_parallel_accuracy(self,
balance_parameter_opt_between_cards):
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
single_first_loss, single_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1000,
feed_dict={"image": img,
"label": label},
use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1000,
feed_dict={"image": img,
"label": label},
use_parallel_executor=True,
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
for p_f in parallel_first_loss:
self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
for p_l in parallel_last_loss:
self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
def test_simple_fc_parallel_accuracy(self):
self.check_simple_fc_parallel_accuracy(False)
def test_simple_fc_parallel_accuracy_with_new_strategy(self):
self.check_simple_fc_parallel_accuracy(True)
def check_batchnorm_fc_convergence(self,
balance_parameter_opt_between_cards):
self.check_network_convergence(fc_with_batchnorm)
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
self.check_network_convergence(
fc_with_batchnorm,
feed_dict={"image": img,
"label": label},
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
def test_batchnorm_fc(self):
self.check_batchnorm_fc_convergence(False)
def test_batchnorm_fc_with_new_strategy(self):
self.check_batchnorm_fc_convergence(True)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from parallel_executor_test_base import TestParallelExecutorBase
import unittest
def squeeze_excitation(input, num_channels, reduction_ratio):
# pool = fluid.layers.pool2d(
# input=input, pool_size=0, pool_type='avg', global_pooling=True)
conv = input
shape = conv.shape
reshape = fluid.layers.reshape(
x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
pool = fluid.layers.reduce_mean(input=reshape, dim=2)
squeeze = fluid.layers.fc(input=pool,
size=num_channels / reduction_ratio,
act='relu')
excitation = fluid.layers.fc(input=squeeze,
size=num_channels,
act='sigmoid')
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) / 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
def shortcut(input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out:
if stride == 1:
filter_size = 1
else:
filter_size = 3
return conv_bn_layer(input, ch_out, filter_size, stride)
else:
return input
def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
# The number of first 1x1 convolutional channels for each bottleneck build block
# was halved to reduce the compution cost.
conv0 = conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = conv_bn_layer(
input=conv0,
num_filters=num_filters * 2,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu')
conv2 = conv_bn_layer(
input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
scale = squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
short = shortcut(input, num_filters * 2, stride)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def SE_ResNeXt50Small(batch_size=2, use_feed=False):
assert not use_feed, "SE_ResNeXt doesn't support feed yet"
img = fluid.layers.fill_constant(
shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
label = fluid.layers.fill_constant(
shape=[batch_size, 1], dtype='int64', value=0.0)
conv = conv_bn_layer(
input=img, num_filters=16, filter_size=3, stride=2, act='relu')
conv = conv_bn_layer(
input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
conv = conv_bn_layer(
input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
for block in range(len(depth)):
for i in range(depth[block]):
conv = bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio)
shape = conv.shape
reshape = fluid.layers.reshape(
x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
pool = fluid.layers.reduce_mean(input=reshape, dim=2)
dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
# Classifier layer:
prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestResnet(TestParallelExecutorBase):
def check_resnet_convergence(self, balance_parameter_opt_between_cards):
import functools
batch_size = 2
self.check_network_convergence(
functools.partial(
SE_ResNeXt50Small, batch_size=batch_size),
iter=20,
batch_size=batch_size,
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
def test_resnet(self):
self.check_resnet_convergence(False)
def test_resnet_with_new_strategy(self):
self.check_resnet_convergence(True)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import numpy as np
import unittest
def simple_fc_net():
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in xrange(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def check_network_convergence(self, build_strategy=None):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = simple_fc_net()
test_program = main.clone(for_test=True)
opt = fluid.optimizer.SGD(learning_rate=0.001)
opt.minimize(loss)
batch_size = 32
image = np.random.normal(size=(batch_size, 784)).astype('float32')
label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
feed_dict = {'image': image, 'label': label}
train_exe = fluid.ParallelExecutor(
use_cuda=True,
loss_name=loss.name,
main_program=main,
build_strategy=build_strategy)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe,
build_strategy=build_strategy)
for i in xrange(5):
test_loss, = test_exe.run([loss.name], feed=feed_dict)
test_loss = np.array(test_loss)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
train_loss = np.array(train_loss)
self.assertTrue(
np.allclose(
train_loss, test_loss, atol=1e-8),
"Train loss: " + str(train_loss) + "\n Test loss:" +
str(test_loss))
def test_parallel_testing(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(build_strategy)
def test_parallel_testing_with_new_strategy(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(build_strategy)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import transformer_model
import numpy as np
from parallel_executor_test_base import TestParallelExecutorBase
import unittest
import paddle
import paddle.dataset.wmt16 as wmt16
WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
class ModelHyperParams(object):
# Dictionary size for source and target language. This model directly uses
# paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
# alreay been added, but the <pad> token is not added. Transformer requires
# sequences in a mini-batch are padded to have the same length. A <pad> token is
# added into the original dictionary in paddle.dateset.wmt16.
# size of source word dictionary.
src_vocab_size = 10000
# index for <pad> token in source language.
src_pad_idx = src_vocab_size
# size of target word dictionay
trg_vocab_size = 10000
# index for <pad> token in target language.
trg_pad_idx = trg_vocab_size
# position value corresponding to the <pad> token.
pos_pad_idx = 0
# max length of sequences. It should plus 1 to include position
# padding token for position encoding.
max_length = 50
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model = 512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 1024
# the dimension that keys are projected to for dot-product attention.
d_key = 64
# the dimension that values are projected to for dot-product attention.
d_value = 64
# number of head used in multi-head attention.
n_head = 8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6
# dropout rate used by all dropout layers.
dropout = 0.1
def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias. Then, convert the numpy
data to tensors and return a dict mapping names to tensors.
"""
def __pad_batch_data(insts,
pad_idx,
is_target=False,
return_pos=True,
return_attn_bias=True,
return_max_len=True):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
inst_data = np.array(
[inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, 1])]
if return_pos:
inst_pos = np.array([[
pos_i + 1 if w_i != pad_idx else 0
for pos_i, w_i in enumerate(inst)
] for inst in inst_data])
return_list += [inst_pos.astype("int64").reshape([-1, 1])]
if return_attn_bias:
if is_target:
# This is used to avoid attention on paddings and subsequent
# words.
slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
max_len))
slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
[-1, 1, max_len, max_len])
slf_attn_bias_data = np.tile(slf_attn_bias_data,
[1, n_head, 1, 1]) * [-1e9]
else:
# This is used to avoid attention on paddings.
slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
(max_len - len(inst))
for inst in insts])
slf_attn_bias_data = np.tile(
slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
[1, n_head, max_len, 1])
return_list += [slf_attn_bias_data.astype("float32")]
if return_max_len:
return_list += [max_len]
return return_list if len(return_list) > 1 else return_list[0]
src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
[inst[0] for inst in insts], src_pad_idx, is_target=False)
trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
[inst[1] for inst in insts], trg_pad_idx, is_target=True)
trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
[1, 1, trg_max_len, 1]).astype("float32")
lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
False, False, False)
lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
return [
src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
]
def transformer(use_feed):
assert not use_feed, "transfomer doesn't support feed yet"
return transformer_model.transformer(
ModelHyperParams.src_vocab_size + 1,
ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer, ModelHyperParams.n_head,
ModelHyperParams.d_key, ModelHyperParams.d_value,
ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
class TestTransformer(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
reader = paddle.batch(
wmt16.train(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
batch_size=transformer_model.batch_size)
with fluid.recordio_writer.create_recordio_writer(
WMT16_RECORDIO_FILE) as writer:
for batch in reader():
for tensor in prepare_batch_input(
batch, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
t = fluid.LoDTensor()
t.set(tensor, fluid.CPUPlace())
writer.append_tensor(t)
writer.complete_append_tensor()
@unittest.skip("transformer is buggy in multi gpu")
def test_main(self):
self.check_network_convergence(transformer)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
def PolygonBoxRestore(input):
shape = input.shape
batch_size = shape[0]
geo_channels = shape[1]
h = shape[2]
w = shape[3]
h_indexes = np.array(range(h) * w).reshape(
[w, h]).transpose()[np.newaxis, :] # [1, h, w]
w_indexes = np.array(range(w) * h).reshape(
[h, w])[np.newaxis, :] # [1, h, w]
indexes = np.concatenate(
(w_indexes, h_indexes))[np.newaxis, :] # [1, 2, h, w]
indexes = indexes.repeat(
[geo_channels / 2],
axis=0)[np.newaxis, :] # [1, geo_channels/2, 2, h, w]
indexes = indexes.repeat(
[batch_size], axis=0) # [batch_size, geo_channels/2, 2, h, w]
return indexes.reshape(
input.shape) - input # [batch_size, geo_channels, h, w]
class TestPolygonBoxRestoreOp(OpTest):
def config(self):
self.input_shape = (1, 8, 2, 2)
def setUp(self):
self.config()
self.op_type = "polygon_box_transform"
input = np.random.random(self.input_shape).astype("float32")
self.inputs = {'Input': input}
output = PolygonBoxRestore(input)
self.outputs = {'Output': output}
def test_check_output(self):
self.check_output()
class TestCase1(TestPolygonBoxRestoreOp):
def config(self):
self.input_shape = (2, 10, 3, 2)
class TestCase2(TestPolygonBoxRestoreOp):
def config(self):
self.input_shape = (3, 12, 4, 5)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle.v2.dataset.mnist as mnist
class TestPreprocessor(unittest.TestCase):
def setUp(self):
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=32)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
'./mnist_for_preprocessor_test.recordio', reader, feeder)
def test_main(self):
N = 10
img_expected_res = []
lbl_expected_res = []
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.io.open_recordio_file(
'./mnist_for_preprocessor_test.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
img, lbl = fluid.layers.io.read_file(data_file)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for _ in range(N):
img_v, lbl_v = exe.run(fetch_list=[img, lbl])
img_expected_res.append(img_v / 2)
lbl_expected_res.append(lbl_v + 1)
img_actual_res = []
lbl_actual_res = []
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.io.open_recordio_file(
'./mnist_for_preprocessor_test.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
preprocessor = fluid.layers.io.Preprocessor(reader=data_file)
with preprocessor.block():
img, lbl = preprocessor.inputs()
img_out = img / 2
lbl_out = lbl + 1
preprocessor.outputs(img_out, lbl_out)
data_file = fluid.layers.io.double_buffer(preprocessor())
img, lbl = fluid.layers.io.read_file(data_file)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for _ in range(N):
img_v, lbl_v = exe.run(fetch_list=[img, lbl])
img_actual_res.append(img_v)
lbl_actual_res.append(lbl_v)
for idx in range(N):
np.allclose(img_expected_res[idx], img_actual_res[idx])
np.allclose(lbl_expected_res[idx], lbl_actual_res[idx])
......@@ -107,7 +107,7 @@ class ControlFlowGraph(object):
# Repeatedly apply liveness updates until the algorithm stablize
# on a complete set live input vars and live output vars.
while True:
for i in range(self.op_size, 0, -1):
for i in reversed(range(self.op_size)):
live_in[i] = set(self._live_in[i])
live_out[i] = set(self._live_out[i])
for s in self._successors[i]:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册