diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6140340890c0e5025eb08209e8ea78df918b4dc0..eeda759ff18ccb86ce6a585fe41cb972ea3ae295 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -34,6 +34,14 @@ repos:
entry: bash ./tools/codestyle/cpplint_pre_commit.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
+- repo: local
+ hooks:
+ - id: pylint-doc-string
+ name: pylint
+ description: Check python docstring style using docstring_checker.
+ entry: bash ./tools/codestyle/pylint_pre_commit.hook
+ language: system
+ files: \.(py)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
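The new hook can be exercised locally before pushing; a minimal sketch, assuming `pre-commit` is installed and using the hook id from the config entry above (the file path is only an example):

```bash
# Run just the new docstring hook against a chosen file;
# the hook id matches the `id: pylint-doc-string` entry added above.
pre-commit run pylint-doc-string --files python/paddle/fluid/layers/nn.py
```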
diff --git a/.travis.yml b/.travis.yml
index 3391e2c3cab9938c9dc5705b51367c707d3bbe9d..8c772030925dcad3909f142b08e4d8057a3f89b7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,6 +18,8 @@ env:
addons:
ssh_known_hosts: 13.229.163.131
before_install:
+ # For pylint docstring checker
+ - sudo pip install pylint pytest astroid isort
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 710b4774ca021c2e916460e7253d4fbf979a38cc..cfaab206e1f321a55119d4a8d65c4a99d3819fff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -57,7 +57,10 @@ option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
+option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
+option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
+option(WITH_CONTRIB "Compile the third-party contribution code" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
@@ -202,7 +205,7 @@ endif(USE_NNPACK)
add_subdirectory(proto)
-if(NOT MOBILE_INFERENCE)
+if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
# "add_subdirectory(go)" should be placed after the following line,
# because it depends on paddle/optimizer.
add_subdirectory(paddle/optimizer)
@@ -230,3 +233,7 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
+
+if (WITH_CONTRIB)
+ add_subdirectory(paddle/contrib)
+endif()
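The new options can be toggled at configure time; a minimal sketch (the out-of-source `build` directory is an assumption, and `WITH_FLUID_ONLY` is defined elsewhere in the build system):

```bash
mkdir -p build && cd build
cmake .. -DWITH_CONTRIB=ON \
         -DWITH_FAST_BUNDLE_TEST=ON \
         -DEIGEN_USE_THREADS=ON \
         -DWITH_FLUID_ONLY=ON   # also skips paddle/optimizer and the go targets, per the guard above
make -j$(nproc)
```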
diff --git a/Dockerfile b/Dockerfile
index ea39efd00bb5c0a7deb3f6d57083d83a673b883c..80a96983ec1ca6b9ec440f7e95de6c328eb1ed40 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -79,6 +79,9 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python
+# For docstring checker
+RUN pip install pylint pytest astroid isort
+
COPY ./python/requirements.txt /root/
RUN pip install -r /root/requirements.txt
@@ -101,6 +104,3 @@ RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
-
-# development image default do build work
-CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/Dockerfile.android b/Dockerfile.android
index 848a7eba6f1421432addae8acff407b611adb4ae..48db2efea21a648657e3f490c95429b9a29ede52 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp
-
-CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
deleted file mode 100644
index 64816098a524f064ec12474a736cd4c721227a70..0000000000000000000000000000000000000000
--- a/benchmark/cluster/README.md
+++ /dev/null
@@ -1,196 +0,0 @@
-# Cluster Training Benchmark
-
-## Setup
-
-- Platform
- - Kubernetes: v1.6.2
- - Linux Kernel: v3.10.0
-
-- Resource
- - CPU: 10 Cores per Pod
- - Memory: 5GB per Pod
-
-- Docker Image
-
-   We use different base Docker images to run the benchmark on Kubernetes:
- - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
- - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
- - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
-
-- Model
- vgg16 is used in this benchmark.
-
-## Cases
-
-- Variable
- - Batch Size of training data.
- - PServer count of the training job.
- - The number of trainers.
-
-- Invariant
- - The resource of trainer/pserver Pod.
-
-### Measure the Performance for Different Batch Size
-
-- PServer Count: 40
-- Trainer Count: 100
-- Metrics: mini-batch / sec
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
-
-### Measure the Performance for Different PServer Count
-
-- Trainer Count: 100
-- Batch Size: 64
-- Metrics: mini-batch / sec
-
-| PServer Count | 10 | 20 | 40 | 60 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
-
-### Measure Parallel Efficiency By Increasing Trainer Count
-
-- PServer Count: 20
-- Batch Size: 64
-- Metrics:
-
-$S = T_1 / T_N$
-
-where $S$ is the speedup: the ratio of $T_1$, the training time with one trainer, to $T_N$, the training time with $N$ trainers.
-The parallel efficiency is then:
-
-$E = S / N$
-
-| Trainer Count | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - |
-
-## Reproduce the benchmark
-
-TODO
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
deleted file mode 100644
index 13ad8e1b6237e6f41a076c4fb54311728832ae33..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/Dockerfile
+++ /dev/null
@@ -1,35 +0,0 @@
-FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-
-# you can get mirror list here:
-# https://launchpad.net/ubuntu/+archivemirrors
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
-RUN pip install -U kubernetes opencv-python
-
-RUN pip install paddlepaddle
-# If the network is slow, you may need to set a proxy here.
-# ENV https_proxy=
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
-RUN pip uninstall -y paddlepaddle
-# Unset the proxy if it was set.
-# ENV https_proxy=""
-
-# NOTE: By default, CI-built wheel packages are compiled with WITH_DISTRIBUTE=OFF,
-# so we must build one with distribute support to install in this image.
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl
-ENV LD_LIBRARY_PATH=/usr/local/lib
-
-# tf k8s
-RUN pip install tensorflow==1.4.0
-ADD tf_k8s /usr/bin
-RUN chmod +x /usr/bin/tf_k8s
-ADD vgg16_tf.py /workspace/
-
-# below lines may change a lot for debugging
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN chmod +x /usr/bin/paddle_k8s
-ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
deleted file mode 100644
index d56a912b9b03986e32693363f82df05a34b779e9..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/README.md
+++ /dev/null
@@ -1,195 +0,0 @@
-# Performance for Distributed vgg16
-
-## Test Result
-
-### Hardware Information
-
-- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
-- cpu MHz : 2101.000
-- cache size : 20480 KB
-
-### Blas settings
-
-Setting environment variable: `MKL_NUM_THREADS=1`.
-
-### Single Node Single Thread
-
-- Metrics: samples / sec
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
-
-
-### Different Batch Size
-
-- PServer Count: 10
-- Trainer Count: 20
-- Metrics: samples / sec
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
-
-### Accelerate Rate
-
-- Pserver Count: 20
-- Batch Size: 128
-- Metrics: samples / sec
-
-| Trainer Count | 20 | 40 | 80 | 100 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
-
-
-### Different Pserver Count
-
-- Trainer Count: 60
-- Batch Size: 128
-- Metrics: samples / sec
-
-| PServer Count | 3 | 6 | 10 | 20 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid (should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 (need more tests) | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
-
-
-*The performance gap between Fluid and v2 comes from network interference.*
-
-
-## Steps to Run the Performance Test
-
-1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
-1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
-1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so kubernetes can find it.
-1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
-1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
-
-Check the logs for the distributed training progress and analyze the performance.
-
-## Enable Verbose Logs
-
-Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
deleted file mode 100644
index ee8b0763b62fc011f40f6197e929a68b48a93e47..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16job-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- paddle-job-pserver: vgg16job
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16job
- - name: MKL_NUM_THREADS
- value: "1"
- - name: TRAINING_ROLE
- value: "PSERVER"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- command: ["paddle_k8s", "start_fluid"]
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
deleted file mode 100644
index 3d56caac009464d1073423bb63abff1f8b0cf28f..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16job-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- paddle-job: vgg16job
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- command: ["paddle_k8s", "start_fluid"]
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16job
- - name: TRAINING_ROLE
- value: "TRAINER"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/run_vgg_dist.sh b/benchmark/cluster/vgg16/run_vgg_dist.sh
deleted file mode 100644
index 8c0501439e9d5fa175f5aa9b62d286e690a10904..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/run_vgg_dist.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-# Update to point to the source file.
-VGG_SRC="vgg16_fluid.py"
-
-export TRAINING_ROLE=PSERVER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
-
-# Need to wait for the ps to start first.
-sleep 10
-echo "done start ps"
-
-export TRAINING_ROLE=TRAINER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
-CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s
deleted file mode 100644
index 4fc263d5f681aeabfa71f1758714d269d987b272..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_k8s
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-check_trainer_ret() {
- ret=$1
- stdbuf -oL echo "job returned $ret...setting pod return message..."
- stdbuf -oL echo "==============================="
-
- if [ $ret -eq 136 ] ; then
- echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
- elif [ $ret -eq 139 ] ; then
- echo "Segmentation Fault" > /dev/termination-log
- elif [ $ret -eq 1 ] ; then
- echo "General Error" > /dev/termination-log
- elif [ $ret -eq 134 ] ; then
- echo "Program Abort" > /dev/termination-log
- fi
-  stdbuf -oL echo "termination log written..."
- exit $ret
-}
-
-g_pservers=""
-g_trainers=""
-
-wait_running_pods(){
- pserver_label="tf-job-pserver=${JOB_NAME}"
- trainer_label="tf-job-trainer=${JOB_NAME}"
-
- stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
- stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
-
- g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
- g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
-}
-
-start_tf_pserver(){
- wait_running_pods
-
- label="tf-job-pserver=${JOB_NAME}"
- pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
-
- cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
- --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
-
- stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
-}
-
-start_tf_trainer(){
- wait_running_pods
-
- label="tf-job-trainer=${JOB_NAME}"
- trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
-
- cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
- --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
-
- stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
- check_trainer_ret $?
-}
-
-start_tf(){
- if [[ "${TF_JOB_NAME}" == "worker" ]]; then
- start_tf_trainer
- else
- start_tf_pserver
- fi
-}
-
-usage() {
- echo "usage: tf_k8s []:"
- echo " start_tf Start tensorflow jobs"
-}
-
-case "$1" in
- start_tf)
- start_tf
- ;;
- --help)
- usage
- ;;
- *)
- usage
- ;;
-esac
diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml
deleted file mode 100644
index 5e37c700819119c8af05c40fe4b8d13911efc3e1..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_pserver.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16job-tf-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- tf-job-pserver: vgg16job-tf
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
- imagePullPolicy: Always
- command: ["tf_k8s", "start_tf"]
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PORT
- value: "32036"
- - name: ENTRY
- value: "python vgg16_tf.py"
- - name: JOB_NAME
- value: vgg16job-tf
- - name: PSERVERS_NUM
- value: "10"
- - name: TF_JOB_NAME
- value: "ps"
- - name: TRAINERS_NUM
- value: "20"
- - name: BATCH_SIZE
- value: "128"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: NUM_PASSES
- value: "1"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml
deleted file mode 100644
index 08795df3addfa7b618db24a65e57be190e268f06..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_trainer.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16job-tf-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- tf-job-trainer: vgg16job-tf
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
- imagePullPolicy: Always
- command: ["tf_k8s", "start_tf"]
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PORT
- value: "32036"
- - name: JOB_NAME
- value: vgg16job-tf
- - name: TF_JOB_NAME
- value: "worker"
- - name: ENTRY
- value: "python vgg16_tf.py"
- - name: PSERVERS_NUM
- value: "10"
- - name: BATCH_SIZE
- value: "128"
- - name: TRAINERS_NUM
- value: "20"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: NUM_PASSES
- value: "1"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
deleted file mode 100644
index dd1271e0cf399184134c06b3200ee1202c65cef0..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16v2job-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- paddle-job-pserver: vgg16v2job
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16v2job
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "python train.py"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- command: ["paddle_k8s", "start_pserver"]
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
deleted file mode 100644
index 12c8964066cbcfe8d2a44de2f51a3d12ea422fe2..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16v2job-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- paddle-job: vgg16v2job
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- command: ["paddle_k8s", "start_trainer", "v2"]
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16v2job
- - name: BATCH_SIZE
- value: "256"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "2"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
deleted file mode 100644
index e9360ab4c79d23bdf9f84d0c0d407af6d39bde3e..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ /dev/null
@@ -1,312 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in Fluid"""
-from __future__ import print_function
-
-import sys
-import time
-import numpy as np
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.profiler as profiler
-import argparse
-import functools
-import os
-from paddle.fluid import debuger
-
-
-def str2bool(v):
- if v.lower() in ('yes', 'true', 't', 'y', '1'):
- return True
- elif v.lower() in ('no', 'false', 'f', 'n', '0'):
- return False
- else:
- raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- '--batch_size', type=int, default=16, help="Batch size for training.")
-parser.add_argument(
- '--learning_rate',
- type=float,
- default=1e-3,
- help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
- '--device',
- type=str,
- default='CPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument('--device_id', type=int, default=0, help="The device id.")
-parser.add_argument(
- '--data_format',
- type=str,
- default='NCHW',
- choices=['NCHW', 'NHWC'],
- help='The data order, now only support NCHW.')
-parser.add_argument(
- '--data_set',
- type=str,
- default='flowers',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
-parser.add_argument(
- '--local',
- type=str2bool,
- default=True,
- help='Whether to run as local mode.')
-
-parser.add_argument(
- "--ps_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--trainer_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--profile", action='store_true', help="If set, profile a few steps.")
-
-# Flags for defining the tf.train.Server
-parser.add_argument(
- "--task_index", type=int, default=0, help="Index of task within the job")
-args = parser.parse_args()
-
-
-def vgg16_bn_drop(input):
- def conv_block(input, num_filter, groups, dropouts):
- return fluid.nets.img_conv_group(
- input=input,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act='relu',
- conv_with_batchnorm=True,
- conv_batchnorm_drop_rate=dropouts,
- pool_type='max')
-
- conv1 = conv_block(input, 64, 2, [0.3, 0])
- conv2 = conv_block(conv1, 128, 2, [0.4, 0])
- conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
- conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
- conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
- drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
- fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
- bn = fluid.layers.batch_norm(input=fc1, act='relu')
- drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
- fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
- return fc2
-
-
-def main():
- if args.data_set == "cifar10":
- classdim = 10
- if args.data_format == 'NCHW':
- data_shape = [3, 32, 32]
- else:
- data_shape = [32, 32, 3]
- else:
- classdim = 102
- if args.data_format == 'NCHW':
- data_shape = [3, 224, 224]
- else:
- data_shape = [224, 224, 3]
-
- # Input data
- images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
- label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
- # Train program
- net = vgg16_bn_drop(images)
- predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
- cost = fluid.layers.cross_entropy(input=predict, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-
- # Evaluator
- batch_size = fluid.layers.create_tensor(dtype='int64')
- batch_acc = fluid.layers.accuracy(
- input=predict, label=label, total=batch_size)
-
- # inference program
- inference_program = fluid.default_main_program().clone()
- with fluid.program_guard(inference_program):
- inference_program = fluid.io.get_inference_program(batch_acc)
-
- # Optimization
- optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
- optimize_ops, params_grads = optimizer.minimize(avg_cost)
-
- # Initialize executor
- place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
- args.device_id)
- exe = fluid.Executor(place)
-
- # test
- def test(exe):
- test_pass_acc = fluid.average.WeightedAverage()
- for batch_id, data in enumerate(test_reader()):
- img_data = np.array(map(lambda x: x[0].reshape(data_shape),
- data)).astype("float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- outs = exe.run(inference_program,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[batch_acc, batch_size])
- test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-
- return test_pass_acc.eval()
-
- def train_loop(exe, trainer_prog):
- iters = 0
- ts = time.time()
- train_pass_acc = fluid.average.WeightedAverage()
- for pass_id in range(args.num_passes):
- # train
- start_time = time.time()
- num_samples = 0
- train_pass_acc.reset()
-
- def run_step(batch_id, data):
- img_data = np.array(
- map(lambda x: x[0].reshape(data_shape), data)).astype(
- "float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- loss, acc, b_size = exe.run(
- trainer_prog,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[avg_cost, batch_acc, batch_size])
- return loss, acc, b_size
-
- if args.profile:
- with profiler.profiler('All', 'total',
- '/tmp/profile_vgg_%d' % args.task_index):
- for batch_id, data in enumerate(train_reader()):
- if batch_id > 5: break
- run_step(batch_id, data)
-
- total_time = 0.0
- count = 0
- for batch_id, data in enumerate(train_reader()):
- ts = time.time()
- loss, acc, b_size = run_step(batch_id, data)
- iters += 1
- num_samples += len(data)
- train_pass_acc.add(value=acc, weight=b_size)
-
- duration = time.time() - ts
- total_time += duration
- count += len(data)
- print(
- "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
- "Speed = %.2f (%.2f) img/s" % (pass_id, iters, loss, acc,
- len(data) / duration,
- count / total_time)
- ) # The accuracy is the accumulation of batches, but not the current batch.
-
- pass_elapsed = time.time() - start_time
- pass_train_acc = train_pass_acc.eval()
- pass_test_acc = test(exe)
- print("Task:%d Pass = %d, Training performance = %f imgs/s, "
- "Train accuracy = %f, Test accuracy = %f\n" %
- (args.task_index, pass_id, num_samples / pass_elapsed,
- pass_train_acc, pass_test_acc))
-
- if args.local:
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
- else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
- train_loop(exe, fluid.default_main_program())
- else:
- trainers = int(os.getenv("TRAINERS")) # total trainer count
- print("trainers total: ", trainers)
-
- training_role = os.getenv(
- "TRAINING_ROLE",
- "TRAINER") # get the training role: trainer/pserver
-
- t = fluid.DistributeTranspiler()
- t.transpile(
- trainer_id=args.task_index,
- pservers=args.ps_hosts,
- trainers=trainers)
-
- if training_role == "PSERVER":
- current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
- "PADDLE_INIT_PORT")
- if not current_endpoint:
- print("need env SERVER_ENDPOINT")
- exit(1)
- pserver_prog = t.get_pserver_program(current_endpoint)
- pserver_startup = t.get_startup_program(current_endpoint,
- pserver_prog)
- exe.run(pserver_startup)
- exe.run(pserver_prog)
- elif training_role == "TRAINER":
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
- else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
- paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
-
- trainer_prog = t.get_trainer_program()
- feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
- # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
- exe.run(fluid.default_startup_program())
- train_loop(exe, trainer_prog)
- else:
-        print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
-
-
-def print_arguments():
- print('----------- Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == "__main__":
- print_arguments()
- main()
diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py
deleted file mode 100644
index 2d220478acae46566760209dbc012cff316946aa..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ /dev/null
@@ -1,366 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in TensorFlow
-You can get distribution example template structure here:
-https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
-https://www.tensorflow.org/deploy/distributed
-"""
-
-import tensorflow as tf
-import paddle.v2 as paddle
-import numpy as np
-import argparse
-import time
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
- '--learning_rate',
- type=float,
- default=1e-3,
- help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
- '--device',
- type=str,
- default='CPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument(
- '--data_format',
- type=str,
- default='NHWC',
- choices=['NCHW', 'NHWC'],
- help='The data order, NCHW=[batch, channels, height, width].'
- 'Only support NHWC right now.')
-parser.add_argument(
- '--data_set',
- type=str,
- default='cifar10',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
-
-parser.add_argument(
- "--ps_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--worker_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--job_name", type=str, default="", help="One of 'worker', 'ps'")
-# Flags for defining the tf.train.Server
-parser.add_argument(
- "--task_index", type=int, default=0, help="Index of task within the job")
-
-args = parser.parse_args()
-
-
-class VGG16Model(object):
- def __init__(self):
- self.parameters = []
-
- def batch_norm_relu(self, inputs, is_training):
- """Performs a batch normalization followed by a ReLU."""
- # We set fused=True for a significant speed boost. See
- # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
- inputs = tf.layers.batch_normalization(
- inputs=inputs,
- axis=1 if args.data_format == 'NCHW' else -1,
- momentum=0.9,
- epsilon=1e-05,
- center=True,
- scale=True,
- training=is_training,
- fused=True)
- inputs = tf.nn.relu(inputs)
- return inputs
-
- def conv_bn_layer(self,
- name,
- images,
- kernel_shape,
- is_training,
- drop_rate=0.0):
- with tf.name_scope(name) as scope:
- kernel = tf.Variable(
- tf.truncated_normal(
- kernel_shape, dtype=tf.float32, stddev=1e-1),
- name='weights')
- conv = tf.nn.conv2d(
- images,
- kernel, [1, 1, 1, 1],
- data_format=args.data_format,
- padding='SAME')
- biases = tf.Variable(
- tf.constant(
- 0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
- trainable=True,
- name='biases')
- out = tf.nn.bias_add(conv, biases)
- out = self.batch_norm_relu(out, is_training)
- out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
- return out
-
- def fc_layer(self, name, inputs, shape):
- with tf.name_scope(name) as scope:
- fc_w = tf.Variable(
- tf.truncated_normal(
- shape, dtype=tf.float32, stddev=1e-1),
- name='weights')
- fc_b = tf.Variable(
- tf.constant(
- 0.0, shape=[shape[-1]], dtype=tf.float32),
- trainable=True,
- name='biases')
- out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
- return out
-
- def network(self, images, class_dim, is_training):
- """ VGG16 model structure.
-
- TODO(kuke): enable this network to support the 'NCHW' data format
- """
-
- # conv1
- conv1_1 = self.conv_bn_layer(
- 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
- conv1_2 = self.conv_bn_layer(
- 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
- # pool1
- pool1 = tf.nn.max_pool(
- conv1_2,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool1')
- # conv2
- conv2_1 = self.conv_bn_layer(
- 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
- conv2_2 = self.conv_bn_layer(
- 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
- # pool2
- pool2 = tf.nn.max_pool(
- conv2_2,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool2')
- # conv3
- conv3_1 = self.conv_bn_layer(
- 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
- conv3_2 = self.conv_bn_layer(
- 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
- conv3_3 = self.conv_bn_layer(
- 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
- # pool3
- pool3 = tf.nn.max_pool(
- conv3_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool3')
- # conv4
- conv4_1 = self.conv_bn_layer(
- 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
- conv4_2 = self.conv_bn_layer(
- 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv4_3 = self.conv_bn_layer(
- 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
- # pool4
- pool4 = tf.nn.max_pool(
- conv4_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool4')
- # conv5
- conv5_1 = self.conv_bn_layer(
- 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv5_2 = self.conv_bn_layer(
- 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv5_3 = self.conv_bn_layer(
- 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
- # pool5
- pool5 = tf.nn.max_pool(
- conv5_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool4')
- # flatten
- shape = int(np.prod(pool5.get_shape()[1:]))
- pool5_flat = tf.reshape(pool5, [-1, shape])
- # fc1
- drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
- fc1 = self.fc_layer('fc1', drop, [shape, 512])
- # fc2
- bn = self.batch_norm_relu(fc1, is_training)
- drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
- fc2 = self.fc_layer('fc2', drop, [512, 512])
-
- fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
-
- return fc3
-
-
-def run_benchmark(cluster_spec, server):
- """Run benchmark on cifar10 or flowers."""
-
- if args.data_set == "cifar10":
- class_dim = 10
- raw_shape = (3, 32, 32)
- dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
- None, 3, 32, 32)
- else:
- class_dim = 102
- raw_shape = (3, 224, 224)
- dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
- None, 3, 224, 224)
-
- device = tf.train.replica_device_setter(
- worker_device="/job:worker/task:{}".format(args.task_index),
- cluster=cluster_spec)
-
- with tf.device(device):
- images = tf.placeholder(tf.float32, shape=dat_shape)
- labels = tf.placeholder(tf.int64, shape=(None, ))
- is_training = tf.placeholder('bool')
- onehot_labels = tf.one_hot(labels, depth=class_dim)
-
- vgg16 = VGG16Model()
- logits = vgg16.network(images, class_dim, is_training)
- loss = tf.losses.softmax_cross_entropy(
- onehot_labels=onehot_labels, logits=logits)
- avg_loss = tf.reduce_mean(loss)
-
- correct = tf.equal(tf.argmax(logits, 1), labels)
- accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
-
- optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
- update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
- global_step = tf.Variable(0, name='global_step', trainable=False)
- with tf.control_dependencies(update_ops):
- train_op = optimizer.minimize(avg_loss, global_step=global_step)
-
- summary_op = tf.summary.merge_all()
- init_op = tf.global_variables_initializer()
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- buf_size=5120),
- batch_size=args.batch_size)
-
- # test
- def test():
- test_accs = []
- for batch_id, data in enumerate(test_reader()):
- test_images = np.array(
- map(lambda x: np.transpose(x[0].reshape(raw_shape),
- axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
- test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
- test_accs.append(
- accuracy.eval(feed_dict={
- images: test_images,
- labels: test_labels,
- is_training: False
- }))
- return np.mean(test_accs)
-
- config = tf.ConfigProto(
- intra_op_parallelism_threads=1,
- inter_op_parallelism_threads=1,
- log_device_placement=True)
- config.gpu_options.allow_growth = True
-
- hooks = [tf.train.StopAtStepHook(last_step=1000000)]
-
- with tf.train.MonitoredTrainingSession(
- master=server.target,
- is_chief=(args.task_index == 0),
- hooks=hooks,
- config=config) as sess:
- iters, num_samples, start_time = 0, 0, 0.0
- for pass_id in range(args.num_passes):
- # train
- num_samples = 0
- start_time = time.time()
- for batch_id, data in enumerate(train_reader()):
- train_images = np.array(
- map(lambda x: np.transpose(x[0].reshape(raw_shape),
- axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
- train_labels = np.array(map(lambda x: x[1], data)).astype(
- 'int64')
- iter_begin_time = time.time()
- _, loss, acc = sess.run([train_op, avg_loss, accuracy],
- feed_dict={
- images: train_images,
- labels: train_labels,
- is_training: True
- })
- iters += 1
- print(
- "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
- % (pass_id, iters, loss, acc,
- len(data) / (time.time() - iter_begin_time)))
- num_samples += len(data)
- train_elapsed = time.time() - start_time
- # test
- pass_test_acc = test()
- print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
- (pass_id, num_samples / train_elapsed, pass_test_acc))
-
-
-def print_arguments():
- print('----------- Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == '__main__':
- print_arguments()
-
- ps_hosts = args.ps_hosts.split(",")
- worker_hosts = args.worker_hosts.split(",")
-
- # Create a cluster from the parameter server and worker hosts.
- cluster_spec = tf.train.ClusterSpec({
- "ps": ps_hosts,
- "worker": worker_hosts
- })
-
- # Create and start a server for the local task.
- server = tf.train.Server(
- cluster_spec, job_name=args.job_name, task_index=args.task_index)
-
- if args.job_name == "ps":
- print("start pserver")
- server.join()
- elif args.job_name == "worker":
- print("start worker")
- run_benchmark(cluster_spec, server)
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
deleted file mode 100644
index 1a66af32d7131997c63bd3c3042875f33a467084..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import gzip
-
-import paddle.v2.dataset.cifar as cifar
-import paddle.v2 as paddle
-import time
-import os
-
-DATA_DIM = 3 * 32 * 32
-CLASS_DIM = 10
-BATCH_SIZE = os.getenv("BATCH_SIZE")
-if BATCH_SIZE:
- BATCH_SIZE = int(BATCH_SIZE)
-else:
- BATCH_SIZE = 128
-print "batch_size", BATCH_SIZE
-NODE_COUNT = int(os.getenv("TRAINERS"))
-ts = 0
-
-
-def vgg(input, nums, class_dim):
- def conv_block(input, num_filter, groups, num_channels=None):
- return paddle.networks.img_conv_group(
- input=input,
- num_channels=num_channels,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act=paddle.activation.Relu(),
- pool_type=paddle.pooling.Max())
-
- assert len(nums) == 5
- # the channel of input feature is 3
- conv1 = conv_block(input, 64, nums[0], 3)
- conv2 = conv_block(conv1, 128, nums[1])
- conv3 = conv_block(conv2, 256, nums[2])
- conv4 = conv_block(conv3, 512, nums[3])
- conv5 = conv_block(conv4, 512, nums[4])
-
- fc_dim = 512
- fc1 = paddle.layer.fc(input=conv5,
- size=fc_dim,
- act=paddle.activation.Relu(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5))
- fc2 = paddle.layer.fc(input=fc1,
- size=fc_dim,
- act=paddle.activation.Relu(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5))
- out = paddle.layer.fc(input=fc2,
- size=class_dim,
- act=paddle.activation.Softmax())
- return out
-
-
-def vgg13(input, class_dim):
- nums = [2, 2, 2, 2, 2]
- return vgg(input, nums, class_dim)
-
-
-def vgg16(input, class_dim):
- nums = [2, 2, 3, 3, 3]
- return vgg(input, nums, class_dim)
-
-
-def vgg19(input, class_dim):
- nums = [2, 2, 4, 4, 4]
- return vgg(input, nums, class_dim)
-
-
-def main():
- global ts
- paddle.init(use_gpu=False)
- image = paddle.layer.data(
- name="image", type=paddle.data_type.dense_vector(DATA_DIM))
- lbl = paddle.layer.data(
- name="label", type=paddle.data_type.integer_value(CLASS_DIM))
-
- extra_layers = None
- # NOTE: for v2 distributed training need averaging updates.
- learning_rate = 1e-3 / NODE_COUNT
- out = vgg16(image, class_dim=CLASS_DIM)
- cost = paddle.layer.classification_cost(input=out, label=lbl)
-
- # Create parameters
- parameters = paddle.parameters.create(cost)
-
- # Create optimizer
- optimizer = paddle.optimizer.Momentum(
- momentum=0.9,
- regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
- BATCH_SIZE),
- learning_rate=learning_rate / BATCH_SIZE,
- learning_rate_decay_a=0.1,
- learning_rate_decay_b=128000 * 35,
- learning_rate_schedule="discexp", )
-
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- cifar.train10(),
- # To use other data, replace the above line with:
- # reader.train_reader('train.list'),
- buf_size=1000),
- batch_size=BATCH_SIZE)
- test_reader = paddle.batch(
- cifar.test10(),
- # To use other data, replace the above line with:
- # reader.test_reader('val.list'),
- batch_size=BATCH_SIZE)
-
- # Create trainer
- trainer = paddle.trainer.SGD(cost=cost,
- parameters=parameters,
- update_equation=optimizer,
- extra_layers=extra_layers,
- is_local=False)
-
- # End batch and end pass event handler
- def event_handler(event):
- global ts, ts_pass
- if isinstance(event, paddle.event.BeginPass):
- ts_pass = time.time()
- if isinstance(event, paddle.event.BeginIteration):
- ts = time.time()
- if isinstance(event, paddle.event.EndIteration):
- if event.batch_id % 1 == 0:
- print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
- event.pass_id, event.batch_id, event.cost, event.metrics,
- time.time() - ts)
- if isinstance(event, paddle.event.EndPass):
- print "Pass %d end, spent: %f" % (event.pass_id,
- time.time() - ts_pass)
- result = trainer.test(reader=test_reader)
- print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-
- trainer.train(
- reader=train_reader, num_passes=200, event_handler=event_handler)
-
-
-if __name__ == '__main__':
- main()
diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md
index 0fc02b704362f79f2219252538b4b3195e665b2c..065df2edb8d3152ab0891798628653d3b283f1df 100644
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@@ -24,22 +24,22 @@ Currently supported `--model` argument include:
* Run the following command to start a benchmark job locally:
```bash
- python fluid_benchmark.py --model mnist --parallel 1 --device GPU --with_test
+ python fluid_benchmark.py --model mnist --device GPU
```
You can choose to use GPU/CPU training. With GPU training, you can specify
- `--parallel 1` to run multi GPU training.
+  `--gpus <gpu_num>` to run multi-GPU training.
* Run distributed training with parameter servers:
* start parameter servers:
```bash
- PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
+ PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* start trainers:
```bash
- PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
+ PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* Run distributed training using NCCL2
```bash
- PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method nccl2
+ PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```
## Run Distributed Benchmark on Kubernetes Cluster
@@ -48,7 +48,7 @@ We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submi
distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash
-python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver --with_test" --disttype pserver
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --device GPU --update_method pserver" --disttype pserver
```
Then the yaml files are generated under the directory `myjob`; you can run:
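The command that follows in the README falls outside this hunk; submitting the generated files would typically look like the following (a sketch, with the directory name taken from `--jobname myjob` above):

```bash
kubectl create -f myjob/
```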
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 1d8f27440d0f1438e0520684ee3e90e8a5891a17..30b070e4acac60caa97a4e8ffd07462cb347ee93 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -94,6 +94,10 @@ def parse_args():
'--memory_optimize',
action='store_true',
help='If set, optimize runtime memory before start.')
+ parser.add_argument(
+ '--use_fake_data',
+ action='store_true',
+        help='If set, omit the actual data-reading operators.')
parser.add_argument(
'--update_method',
type=str,
@@ -198,6 +202,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe.run(train_prog)
return
+ if args.use_fake_data:
+        raise Exception(
+            "Fake data is not supported in the single-GPU test for now.")
+
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_prog)
@@ -244,7 +252,31 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id):
+ feed_var_list = [
+ var for var in train_prog.global_block().vars.itervalues()
+ if var.is_data
+ ]
+    # Generate fake data: fill each feed variable with constants so no reader runs.
+ if args.use_fake_data:
+ for var in feed_var_list:
+ v = startup_prog.global_block().clone_variable(var)
+ var.persistable = True
+ v.persistable = True
+
+ real_shape = list(var.shape)
+ real_shape[0] = args.batch_size / args.gpus
+ startup_prog.global_block().append_op(
+ outputs={"Out": v},
+ type="fill_constant",
+ attrs={"shape": real_shape,
+ "value": 1.0,
+ "dtype": var.dtype})
+
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ if nccl_id_var and trainer_id == 0:
+        # FIXME(wuyi): wait for the other trainers to start listening
+ time.sleep(30)
+
startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
@@ -256,10 +288,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
exec_strategy=strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
- feed_var_list = [
- var for var in train_prog.global_block().vars.itervalues()
- if var.is_data
- ]
+
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in range(args.pass_num):
num_samples = 0
@@ -271,7 +300,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
num_samples = 0
if iters == args.iterations:
break
- loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+ if args.use_fake_data:
+ loss, = exe.run([avg_loss.name])
+ else:
+ loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
num_samples += len(data)
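Combined with the updated README commands, a quick local smoke test of the fake-data path might look like this (a sketch; the model name and `--gpus 2` are arbitrary choices):

```bash
python fluid_benchmark.py --model mnist --device GPU --gpus 2 --use_fake_data
```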
diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py
index 3dbb4b8c5dd13657f8d1853003b321ad047e1349..39ba207fd96f71563504017e77dc0e87c249b3f8 100644
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
@@ -112,6 +112,7 @@ def gen_job():
envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
+ envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster.
envs.append({
diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py
index b64a7f78ff10d03987ea4a8c13a0e34bb433f64c..2d09d940a5ee638e4b55405d05924e2d76006cfc 100644
--- a/benchmark/fluid/kube_templates/__init__.py
+++ b/benchmark/fluid/kube_templates/__init__.py
@@ -54,5 +54,13 @@ envs = [
"fieldPath": "status.podIP"
}
}
+ },
+ {
+ "name": "PADDLE_CURRENT_IP",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "status.podIP"
+ }
+ }
}
]
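Rendered into a pod spec, the new entry is a standard Kubernetes downward-API variable, equivalent to:

```yaml
env:
  - name: PADDLE_CURRENT_IP
    valueFrom:
      fieldRef:
        fieldPath: status.podIP
```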
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e490397cc0624c310949a4b571bd00cac6e8953b..682614742cf1bd3130c638020a2545e16226d4d6 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -41,6 +41,10 @@ if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
+if(EIGEN_USE_THREADS)
+ add_definitions(-DEIGEN_USE_THREADS)
+endif(EIGEN_USE_THREADS)
+
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
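For reference, defining `EIGEN_USE_THREADS` unlocks Eigen's thread-pool device for tensor expressions. A minimal standalone sketch (not Paddle code) of what the definition enables:

```cpp
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);                  // four worker threads
  Eigen::ThreadPoolDevice device(&pool, 4);
  Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
  a.setRandom();
  b.setRandom();
  c.device(device) = a + b;                   // evaluated on the thread pool
  return 0;
}
```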
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 0fde4373a4be58e71ff1a305bd4991cc554d7a34..2665996432b1f6681927320a85d6835094abe4cd 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -212,6 +212,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS}
-Dprotobuf_BUILD_TESTS=OFF
+ -DCMAKE_SKIP_RPATH=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index 91449042fcdfd48c95f3dd3babf958c5d572e747..f53da4d194f8d2428b4121fa1bb31f3fc95a9f64 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -1003,9 +1003,9 @@ dice_loss
.. autofunction:: paddle.fluid.layers.dice_loss
:noindex:
-bilinear_interp
+upsampling_bilinear2d
____
-.. autofunction:: paddle.fluid.layers.bilinear_interp
+.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
:noindex:
diff --git a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..0c0156c8e46378e7bbeea8072938b8ccfb9ab6d7
--- /dev/null
+++ b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
@@ -0,0 +1,1819 @@
+
+# Paddle Fluid Developer's Guide
+
+---
+
+### ==1.== Why do we need PaddlePaddle Fluid?
+
+---
+
+### Two fundamental questions
+
+
+
+1. How to describe a machine learning model and its optimization process?
+    - The description must be complete and self-contained, expressive enough to cover any computation that may arise
+1. How to make full use of hardware resources for efficient computation?
+    - Support heterogeneous devices, multi-GPU, and distributed computation
+    - Reduce the development cost of computation and of computation optimization
+    - ...
+
+
+
+---
+
+### How to describe the model and the optimization process?
+
+
+
+
+
+
+| | A sequence of layers executed one after another | A computation graph of variables and operators | No concept of a "model" at all |
+|------|------|------|------|
+| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
+| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
+| 2016 | | | PyTorch, TensorFlow Eager Execution, **==PaddlePaddle Fluid==** |
+
+
+---
+
+
+### Goals
+
+
+
+- Improve the ability to describe all kinds of machine learning tasks: be able to describe any model that may arise.
+- Keep the code structure clear and the modules fully decoupled: internal and external contributors can focus on the modules they need and build on top of the framework.
+- By design, leave room and potential for future performance optimization.
+- With the code decoupled, reduce the development cost of multi-device support, computation optimization, and so on.
+- Under a unified design, implement auto-scalable, fault-tolerant distributed computation.
+
+
+
+---
+
+## ==2.== Design Overview
+
+---
+
+# Fluid: system architecture
+
+- [A compiler-like execution pipeline that separates compile time from runtime](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
+
+
+
+
+
+
+---
+
+#### Let us distinguish compile time from runtime in a concrete Fluid program
+
+---
+### Fluid at compile time
+
+
+
+- ==**Define the forward computation**==
+
+ ```python
+ x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+ y_predict = fluid.layers.fc(input=x, size=1, act=None)
+ y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+ cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+ avg_cost = fluid.layers.mean(x=cost)
+ ```
+
+- ==**Add backward, regularization, and optimization**==
+ ```python
+ learning_rate = 0.01
+ sgd_optimizer = fluid.optimizer.SGD(learning_rate)
+ sgd_optimizer.minimize(avg_cost)
+ ```
+
+
+---
+
+### `Program` vs. the computation graph
+
+
+
+- In scientific computing, the computation graph is a classic way to describe computation. The figure below shows how a complete graph is built from the forward graph (blue) by adding the backward (red) and optimizer-related (green) operators:
+
+- Fluid ==uses a `Program`, not a computation graph,== to describe the model and the optimization process. A `Program` consists of `Block`s, `Operator`s, and `Variable`s; these concepts are expanded on later.
+- At compile time, Fluid takes the forward computation (for now, think of it as an ordered sequence of computations) as a `Program` and completes it by appending the related `Operator`s and `Variable`s in the order: forward -> backward -> gradient clipping -> regularization -> optimization.
+
+
+
+---
+
+### Fluid at runtime
+
+
+
+- ==**Read in data**==
+
+ ```python
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
+ batch_size=20)
+ ```
+- ==**Choose the device that executes the program**==
+ ```python
+ place = fluid.CPUPlace()
+ feeder = fluid.DataFeeder(place=place,feed_list=[x, y])
+ ```
+
+- ==Create an Executor, run the startup `Program`, then run the training `Program`==
+
+ ```python
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ PASS_NUM = 100
+ for pass_id in range(PASS_NUM):
+ for data in train_reader():
+ avg_loss_value, = exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
+ print(avg_loss_value)
+ ```
+
+
+---
+
+### Summary: what does the framework do, and what does the user do?
+
+
+
+
+
+
+| Building the training program | Executing the training program |
+|------|------|
+| User: describe the forward computation<br>Framework: add the backward computation<br>Framework: add the optimization computation<br>Framework: add memory optimization<br>Framework: add parallel / multi-device / distributed computation units | Framework: create Operators (computation) and Variables (data)<br>Framework: create `Block`s<br>Framework: manage memory and devices<br>Framework: execute the computation |
+
+
+
+
+
+---
+
+### Summary: compile time
+
+
+**The user writes a Python program that describes the model's forward computation**
+1. Create the variable descriptions: `VarDesc`
+1. Create the operator descriptions: `OpDesc`
+1. Create the operator attributes
+1. Infer variable types and shapes, and run static checks: `InferShape`
+1. Plan memory reuse among variables
+1. Create the backward computation
+1. Append the optimization-related Operators
+1. (Optional) append multi-GPU/multi-node Operators, generating a program that runs on multiple devices or machines
+
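+After these steps, the appended operators can be inspected directly on the `Program`. A minimal sketch (assuming the newer `paddle.fluid` import path; older code uses `paddle.v2.fluid`):
+
+```python
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)
+
+# forward, backward, and optimization ops now all live in the main program
+print([op.type for op in fluid.default_main_program().global_block().ops])
+```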
+
+
+---
+
+### Summary: runtime
+
+
+**Execute the planned computation**
+1. Create an `Executor`
+1. For the piece of computation about to run, create a `Scope` inside the hierarchical `Scope` space
+1. Create the `Block`s and execute them one by one
+
+
+
+ Figure. Overview of compile time and runtime
+
+
+
+
+---
+
+## ==3.== How does the user describe computation?
+---
+
+### Fluid: define computation ==as if writing a program==
+
+
+- Sequential execution
+ ```python
+ x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+ y_predict = fluid.layers.fc(input=x, size=1, act=None)
+ y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+ cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+ ```
+
+- Conditional branches: [switch](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), [ifelse](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md)
+
+ ```python
+ a = fluid.Var(10)
+ b = fluid.Var(0)
+
+ switch = fluid.switch()
+ with switch.block():
+ with switch.case(fluid.less_equal(a, 10)):
+ fluid.print("Case 1")
+ with switch.case(fluid.larger(a, 0)):
+ fluid.print("Case 2")
+ with switch.default():
+ fluid.print("Case 3")
+ ```
+
+>[A Lisp cond form may be compared to a continued if-then-else as found in many algebraic programming languages](https://www.cs.cmu.edu/Groups/AI/html/cltl/clm/node84.html).
+
+
+
+---
+
+### Fluid: define computation ==as if writing a program==
+
+
+
+- Loops: [while](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+ ```python
+  i = layers.zeros(shape=[1], dtype='int64')
+  d0 = layers.data("d0", shape=[10], dtype='float32')
+  data_array = layers.array_write(x=d0, i=i)
+  array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
+
+  cond = layers.less_than(x=i, y=array_len)
+  while_op = layers.While(cond=cond)
+  with while_op.block():
+      d = layers.array_read(array=data_array, i=i)
+      result = layers.elementwise_add(x=d, y=d)  # the per-step computation
+      i = layers.increment(x=i, in_place=True)
+      layers.array_write(result, i=i, array=data_array)
+      layers.less_than(x=i, y=array_len, cond=cond)
+ ```
+
+- See the full example [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_while_op.py#L36-L44)
+- beam search [->]( https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+
+
+---
+
+#### Summary
+
+
+
+1. The description syntax offered to users is complete and self-contained, able to describe complex computation processes
+1. Its usage and core concepts mirror those of programming languages, so existing intuition transfers directly
+1. It supports defining a problem and solving it step by step
+
+
+
+---
+
+## ==4.== Core concepts
+
+---
+### Compile-time concepts: ==describing variables and computation==
+
+
+
+- `VarDesc` + `TensorDesc` + `OpDesc` -> `BlockDesc` -> `ProgramDesc`
+ - https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto
+
+- What is a Fluid Program
+
+  - In Fluid, a neural-network job (training or inference) is described by a `Program`
+  - A `Program` contains descriptions of `Variable`s (the data) and `Operator`s (the operations on the data)
+  - `Variable`s and `Operator`s are organized into `Block`s, which can be nested, and together form a complete Fluid `Program`
+
+
+>At the end of the compile phase, after the Transpiler has planned and transformed the execution, a `ProgramDesc` serialized with `protobuf` is produced. It can be sent to multiple GPUs or to other compute nodes in the network for execution
+
+
+
+---
+
+### Compile-time concepts: ==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**==
+
+
+1. Takes a `ProgramDesc` as input and produces a new `ProgramDesc`
+
+   - *Memory optimization transpiler*: inserts `FreeMemoryOps` into the original `ProgramDesc` to release memory early, before an optimization iteration ends, keeping the memory footprint small
+
+   - *Distributed training transpiler*: converts the original `ProgramDesc` into its distributed counterpart, producing two new `ProgramDesc`s:
+      1. the `ProgramDesc` executed by the trainer processes
+      1. the `ProgramDesc` executed by the parameter servers
+
+1. ==**WIP**==: takes a `ProgramDesc` and generates code that can be compiled directly by `gcc`, `nvcc`, `icc`, etc., producing an executable
+
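+A minimal sketch of driving the distributed-training transpiler from Python (the endpoint strings are placeholders):
+
+```python
+import paddle.fluid as fluid
+
+t = fluid.DistributeTranspiler()
+t.transpile(trainer_id=0,
+            pservers="127.0.0.1:6174",  # comma-separated pserver endpoints
+            trainers=2)
+pserver_prog = t.get_pserver_program("127.0.0.1:6174")
+trainer_prog = t.get_trainer_program()
+```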
+
+
+---
+### Transpiler
+
+
+
+
+
+---
+
+### Printing a `ProgramDesc`
+
+
+
+
+
+
+
+- `default_startup_program`: creates the learnable parameters and initializes them
+- `default_main_program`: the user-defined model, including the forward pass, the backward pass, optimization, and all other necessary computation
+
+- Printing a human-readable `Program`
+ ```python
+ from paddle.v2.fluid import debuger
+ print debuger.pprint_program_codes(framework.default_main_program().desc)
+ ```
+
+
+---
+### Sample output
+
+
+
+
+
+(Figure: two screenshots of the printed output, showing the variables in block 0.)
+
+
+
+
+
+---
+
+### Runtime concepts
+
+
+
+- Data-related
+  - `Tensor` / `LoDTensor` / `Variable`
+  - `Scope`
+
+- Computation-related
+  - `Block`
+  - `Kernel`, `OpWithKernel`, `OpWithoutKernel`
+
+
+
+| | protobuf messages | C++ class objects |
+|------|------|------|
+| Data | [VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107) | [Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) |
+| Operation | [OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35) | [Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64) |
+| Block | BlockDesc | Block |
+
+
+
+
+
+
+- Execution-related: `Executor`
+
+
+
+---
+#### Tensor and LoD (Level-of-Detail) Tensor
+
+
+- A Tensor is the generalization of an $n$-dimensional array; a LoDTensor is a Tensor with sequence information attached
+- In Fluid, the inputs, outputs, and all learnable parameters of a network are uniformly represented as LoDTensors (n-dimensional arrays)
+- One mini-batch of input data is one LoDTensor
+  - Thanks to the `LoDTensor` representation, RNNs in Fluid handle variable-length sequences without padding
+  - The LoD can be understood, roughly, as `std::vector<std::vector<size_t>>`
+  - For non-sequence data, the LoD information is empty
+
+
+
+| | TensorFlow | PaddlePaddle |
+|------|------|------|
+| RNN | Support | Support |
+| recursive RNN | Support | Support |
+| padding zeros | Must | No need |
+| blob data type | Tensor | LoDTensor |
+
+
+
+
+
+
+
+---
+#### LoD examples
+
+
+
+
+
+
+
+- LoD information of figure (a)
+ ```cpp
+ [0, 5, 8, 10, 14]
+ ```
+- LoD information of figure (b)
+ ```cpp
+ [[0, 5, 8, 10, 14] /*level=1*/, [0, 2, 3, 5, 7, 8, 10, 13, 14] /*level=2*/]
+ ```
+
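+A hedged sketch of attaching the LoD of figure (a) to a tensor via the low-level `core.LoDTensor` API:
+
+```python
+import numpy as np
+import paddle.fluid.core as core
+
+place = core.CPUPlace()
+t = core.LoDTensor()
+t.set(np.random.rand(14, 8).astype('float32'), place)  # 14 = total time steps
+t.set_lod([[0, 5, 8, 10, 14]])  # four sequences, of lengths 5, 3, 2, 4
+```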
+
+---
+#### How Tensor, Variable, and Scope relate
+
+
+
+
+
+
+1. `Block` is an implementation-level concept and is not exposed at the application layer. Currently users cannot create or use a `Block` directly; the only concept they perceive is the `Program`.
+1. Logically, a `Block` is analogous to a pair of curly braces in a programming language: it defines a scope in which a piece of code runs
+1. The `Executor` creates a `Scope` for every `Block`; since `Block`s can be nested, `Scope`s can be nested as well
+
+
+
+---
+### Executor
+
+
+
+
+
+| Interface | Description |
+|------|------|
+| (see figure) | Inputs: 1. `ProgramDesc` 2. `Scope` 3. `block_id`<br><br>Interpreted execution: 1. create all the Variables 2. create and run the Operators one by one |
+
+
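+A minimal sketch of driving the Executor from Python with an explicit `Scope` (assuming the `paddle.fluid` namespace):
+
+```python
+import numpy as np
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.fc(input=x, size=1)
+
+exe = fluid.Executor(fluid.CPUPlace())
+scope = fluid.core.Scope()
+with fluid.scope_guard(scope):  # all Variables are created in this Scope
+    exe.run(fluid.default_startup_program())
+    out, = exe.run(fluid.default_main_program(),
+                   feed={'x': np.random.rand(4, 13).astype('float32')},
+                   fetch_list=[y])
+```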
+
+
+---
+### Operator/OpWithKernel/Kernel
+
+
+
+
+
+
+- Operators are stateless; the core of an Operator is its ==Run== method
+- A single operator can register multiple kernels
+- An operator may have no kernel at all: while_op, ifelse_op
+
+
+
+---
+#### Fluid Operator vs. PaddlePaddle layers
+
+
+
+
+| Layer | Operator |
+|------|------|
+| (see figure) | (see figure) |
+| 1. Maintains internal state<br>2. Provides both forward and backward methods | 1. Stateless<br>2. Provides only a Run method |
+
+
+
+
+
+
+---
+
+### ==5.== Memory management
+
+---
+### Goals
+
+- Provide unified memory allocation and reclamation interfaces for heterogeneous devices
+- Minimize both the time spent managing memory and the management overhead itself
+- Reduce memory fragmentation
+- Decouple memory management completely from computation (Operators/Kernels)
+- Unified memory management is the foundation of memory optimization
+
+---
+
+
+
+### The Memory interface
+
+- The memory-management module exposes three basic interfaces to the application logic above it:
+  ```cpp
+  template <typename Place>
+  void* Alloc(Place place, size_t size);
+
+  template <typename Place>
+  void Free(Place place, void* ptr);
+
+  template <typename Place>
+  size_t Used(Place place);
+
+  struct Usage : public boost::static_visitor<size_t> {
+    size_t operator()(const platform::CPUPlace& cpu) const;
+    size_t operator()(const platform::CUDAPlace& gpu) const;
+  };
+  ```
+- The template parameter `Place` indicates the device on which the allocation takes place
+- Each supported `Place` specializes the template and implements these three interfaces
+
+
+
+---
+### Code structure
+
+
+
+The memory-management module can be understood as two parts:
+
+1. SystemAllocator: the interface that actually allocates and frees memory on the physical device
+1. BuddyAllocator: the memory-management algorithm
+
+
+
+---
+### System Allocator
+
+
+
+- SystemAllocator is the base class for physical memory allocation and reclamation
+  - Allocation and reclamation on different devices all reduce to calls on this standard interface
+  - Each device implements a MemoryAllocator that inherits from SystemAllocator
+
+ ```cpp
+ class SystemAllocator {
+ public:
+ virtual ~SystemAllocator() {}
+ virtual void* Alloc(size_t& index, size_t size) = 0;
+ virtual void Free(void* p, size_t size, size_t index) = 0;
+ virtual bool UseGpu() const = 0;
+ };
+ ```
+
+
+---
+
+### CPU/GPU Allocator
+
+
+
+```cpp
+class CPUAllocator : public SystemAllocator {
+ public:
+ virtual void* Alloc(size_t& index, size_t size);
+ virtual void Free(void* p, size_t size, size_t index);
+ virtual bool UseGpu() const;
+};
+
+#ifdef PADDLE_WITH_CUDA
+class GPUAllocator : public SystemAllocator {
+ public:
+ virtual void* Alloc(size_t& index, size_t size);
+ virtual void Free(void* p, size_t size, size_t index);
+ virtual bool UseGpu() const;
+ private:
+ size_t gpu_alloc_size_ = 0;
+ size_t fallback_alloc_size_ = 0;
+};
+#endif
+```
+- CPUAllocator and GPUAllocator inherit from SystemAllocator and call the corresponding standard library functions to allocate and free physical memory.
+- Once a large, contiguous chunk of physical memory has been allocated, the memory-management algorithm takes over block-wise allocation, reclamation, and reuse.
+
+
+
+---
+### CPU Allocator
+
+
+
+- CPU memory allocation offers two options:
+  1. non-pinned memory: pageable memory
+  2. pinned memory: page-locked memory
+  - Allocating too much page-locked memory reduces the pageable memory available to the system and may hurt overall performance; by default, CPU allocations are pageable
+
+- gflags control how much memory is allocated in one shot and whether page-locked memory is used.
+
+ ```cpp
+ DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+ DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+ "Default use 100% of CPU memory for PaddlePaddle,"
+ "reserve the rest for page tables, etc");
+ ```
+
+
+
+---
+### GPU Allocator
+
+
+
+- GPU memory is allocated with cudaMalloc
+- GPUAllocator::Alloc first computes the available memory on the specified GPU device
+  - If the requested size fits within the available memory, cudaMalloc is called to allocate it
+  - If the available memory is insufficient, the current implementation reports an error and exits.
+- A gflag controls how much GPU memory is allocated in one shot:
+
+ ```cpp
+ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
+ "Default use 92% of GPU memory for PaddlePaddle,"
+ "reserve the rest for page tables, etc");
+ ```
+
+
+
+---
+#### The memory-management algorithm: [Buddy Memory Allocation](https://en.wikipedia.org/wiki/Buddy_memory_allocation)
+
+
+
+- Memory arena: a large chunk of contiguous memory is allocated once; all subsequent management, i.e. dynamically allocating, freeing, and reusing blocks, happens on top of it.
+- Buddy memory allocation:
+  - Memory is partitioned into power-of-two blocks, and allocation requests are served with a best-fit strategy.
+  - On free, the buddy block is checked to see whether the adjacent block has also been freed; if so, the two blocks are merged to minimize memory fragmentation.
+  - Allocated memory is aligned to natural physical-memory boundaries, improving access efficiency.
+  - The algorithm is time-efficient, but because of the best-fit strategy it wastes some memory
+
+
+
+---
+
+### Buddy Allocator
+
+
+
+- BuddyAllocator is a singleton; each device (e.g. CPU(0), GPU(0), GPU(1)) owns one BuddyAllocator
+- A BuddyAllocator holds a SystemAllocator as a private member
+- When a request exceeds the free memory managed by the BuddyAllocator, the SystemAllocator is called to allocate physical memory on the specified device
+
+
+
+---
+### Example: the CPU implementation of the memory interfaces
+
+
+
+- For the layers above, allocation, reclamation, and usage queries are all served through the BuddyAllocator
+ ```cpp
+ template <>
+ void* Alloc(platform::CPUPlace place, size_t size) {
+ VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+ void* p = GetCPUBuddyAllocator()->Alloc(size);
+ VLOG(10) << " pointer=" << p;
+ return p;
+ }
+
+ template <>
+ void Free(platform::CPUPlace place, void* p) {
+ VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+ GetCPUBuddyAllocator()->Free(p);
+ }
+
+ template <>
+ size_t Used(platform::CPUPlace place) {
+ return GetCPUBuddyAllocator()->Used();
+ }
+ ```
+
+
+---
+### ==6.== Multi-device support
+
+---
+### Multi-device support (1)
+
+
+
+- step 1: add a Place type; the device contributor implements it and registers it with the framework (a Python-side sketch follows)
+  - A Place can be understood as an integer plus an enum: device id + device type
+
+
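+On the Python side a `Place` is created directly; a minimal sketch:
+
+```python
+import paddle.fluid as fluid
+
+cpu = fluid.CPUPlace()      # device type: CPU
+gpu0 = fluid.CUDAPlace(0)   # device type: GPU, device id: 0
+exe = fluid.Executor(gpu0)  # the Executor runs on the chosen Place
+```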
+
+
+- DeviceContext
+  - Each Place has a corresponding DeviceContext that organizes and manages the device-related information
+  - For example, GpuDeviceContext manages the CUDA stream
+  - In the current implementation, some special libraries also get their own DeviceContext, for example:
+    ```cpp
+    class MKLDNNDeviceContext : public CPUDeviceContext {...}
+    ```
+  - What each device's DeviceContext has to manage varies and is implemented per concrete need
+
+
+
+---
+
+### Multi-device support (2)
+
+
+
+- step 2: add a KernelType and register Kernel objects for it; the contributor implements them and registers them with the framework. Kernels of the same operator can be distinguished by:
+  1. Place: the device it executes on
+  1. DataType: the data type, FP32/FP64/INT32/INT64
+  1. Memory layout: how the Tensor is laid out in memory at runtime, e.g. NCHW, NHWC
+  1. The library it uses
+
+  so that one operator can register multiple Kernels.
+
+ ```cpp
+ struct OpKernelType {
+ proto::DataType data_type_;
+ DataLayout data_layout_;
+ platform::Place place_;
+ LibraryType library_type_;
+ }
+ ```
+
+
+
+---
+
+### Multi-device support (3)
+
+
+
+step 3: infer the KernelType and switch Kernels at runtime; modify the inference and switching rules as needed
+- Expected Kernel: the Kernel we want to call, determined by (1) the `Place` and the computation precision, or (2) a library explicitly specified by the user in the configuration, such as `cudnn` or `mkldnn`.
+- Actual Kernel: the `KernelType` that can actually be inferred at runtime from the `Operator`'s inputs (`Variable`s)
+- When the Expected Kernel and the Actual Kernel differ, the framework inserts `data_transformer` or `data_layout_transform` operators so that the Expected Kernel can run, covering:
+  - CPUPlace -> GPUPlace: cross-device memory copy
+  - NCHW -> nChw8c: layout transformation
+  - FP32 -> FP16: precision conversion _**not yet supported**_
+  - ...
+- This logic is implemented in the Run method of the OperatorWithKernel class [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.cc#L497)
+
+
+
+---
+## ==7.== while_op
+
+---
+### while_op
+
+
+
+- Executes a `Program` in a loop until the condition operator decides the loop condition no longer holds
+- What makes while_op special:
+  1. while_op has no kernel
+  1. while_op owns its own `Block` and thus forms nested `Block`s
+  1. ==while_op creates an Executor internally to execute its `Block` in a loop==
+
+- while_op input and output: LoDTensorArray
+  ```cpp
+  namespace paddle {
+  namespace framework {
+  using LoDTensorArray = std::vector<LoDTensor>;
+  }
+  }
+  ```
+  - Every iteration "slices" one piece out of the original input
+  - LoDTensorArray is exposed on the Python side and is one of the basic data structures Fluid supports; users can create and use it directly (see the sketch below)
+
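+A hedged sketch of creating and using a LoDTensorArray from Python:
+
+```python
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[10], dtype='float32')
+i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
+arr = fluid.layers.create_array(dtype='float32')  # a LoDTensorArray variable
+fluid.layers.array_write(x, i=i, array=arr)       # arr[0] = x
+y = fluid.layers.array_read(arr, i=i)             # read it back
+```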
+
+
+---
+### An overview of while_op's [Run](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/while_op.cc#L42) method
+
+
+
+```cpp
+
+void Run(const framework::Scope &scope,
+         const platform::Place &dev_place) const override {
+  PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+  auto &cond = scope.FindVar(Input(kCondition))->Get<framework::LoDTensor>();
+  PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
+
+  framework::Executor executor(dev_place);
+  auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
+  auto *program = block->Program();
+  auto step_scopes =
+      scope.FindVar(Output(kStepScopes))
+          ->GetMutable<std::vector<framework::Scope *>>();
+
+  while (cond.data<bool>()[0]) {
+    auto &current_scope = scope.NewScope();
+    step_scopes->push_back(&current_scope);
+    executor.Run(*program, &current_scope, block->ID(),
+                 false /*create_local_scope*/);
+  }
+}
+
+```
+
+
+
+---
+### An important application of while_op: Dynamic RNN
+
+---
+
+### What is `dynamicRNN`?
+
+
+
+
+1. The user defines the computation within one time step; the framework takes sequence input and calls this single-step computation over it in a loop
+1. The learnable parameters are shared across time steps
+1. `dynamicRNN` is implemented with `while_op`
+1. If a `memory` is defined inside the `dynamicRNN`, it forms a recurrent neural network; otherwise it simply applies the predefined single-step computation to the input sequence in a loop
+
+
+
+---
+
+#### The `dynamicRNN` user interface
+
+
+
+
+
+
+- The key elements of `dynamicRNN` (a usage sketch follows this list)
+  1. **step input**: the input to `dynamicRNN` at each time step
+  1. **step function**: the user-defined single-step computation
+  1. **memory**: used to form the recurrent connection
+  1. **external/static memory**: external input that every step of the step function can read in full
+
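+A hedged sketch of the `fluid.layers.DynamicRNN` interface (the vocabulary and layer sizes are placeholder values):
+
+```python
+import paddle.fluid as fluid
+
+sentence = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
+emb = fluid.layers.embedding(input=sentence, size=[10000, 32])  # placeholder vocab
+
+drnn = fluid.layers.DynamicRNN()
+with drnn.block():                                   # the step function
+    word = drnn.step_input(emb)                      # step input
+    prev = drnn.memory(shape=[200])                  # memory, zero-initialized
+    hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu')
+    drnn.update_memory(prev, hidden)                 # the recurrent connection
+    drnn.output(hidden)
+
+last = fluid.layers.sequence_last_step(drnn())       # output of the final step
+```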
+
+
+---
+
+#### Memory in dynamicRNN
+
+
+
+The behavior of a `memory` in `dynamicRNN` closely resembles a reference variable in C++:
+  - a `memory` "points to" the output variable of some operator, call it A
+  - a `memory` can be initialized from a LoDTensor (if the LoD information is empty it is non-sequence data, otherwise sequence data); by default a `memory` is zero-initialized
+  - after operator A runs its forward computation, the `memory` "points at" A's output LoDTensor in its own forward pass
+  - the `memory`'s output can be the input of another operator, which closes the "recurrent" connection
+
+
+
+---
+
+### DynamicRNN implementation details
+
+
+
+- `while_op` alone cannot form a dynamicRNN; it must work together with a set of related operators and data structures
+  - the operators it depends on (only the most important ones, not all):
+    - `lod_rank_table` operator
+    - `lod_tensor_to_array` operator
+    - `array_to_lod_tensor` operator
+    - `shrink_memory` operator
+  - the data structures it depends on
+    - `TensorArray`
+    - `LoDRankTable`
+
+- In Fluid, RNNs take variable-length sequence input without padding; the data structures and operators above cooperate to compute over variable-length input in batches
+
+
+
+---
+
+### How does `dynamicRNN` compute over a batch?
+
+
+
+- The problem:
+  - An RNN can be seen as an unrolled feed-forward network whose depth equals the length of the longest sequence
+  - If the variable-length sequences are not padded to equal length, the mini-batch input at each step is unequal in size and every sample unrolls to a different depth, which makes the forward and backward computation hard to implement
+
+
+
+----
+##### Example: RNN encoder-decoder with attention
+
+
+
+- Take the RNN encoder-decoder model for machine translation as an example (it touches every design element of `dynamicRNN`); the figure below shows the raw batch input of the RNN encoder-decoder:
+
+
+Figure. Raw batch input of the RNN encoder-decoder
+
+- the source word sequences are the input of the encoder RNN, stored as one LoDTensor
+- the target word sequences are the input of the lookup table, stored as one LoDTensor
+- each rectangle in the figure is a contiguous chunk of CPU/GPU memory holding one dense vector
+
+
+
+---
+
+### How does `dynamicRNN` compute over a batch?
+
+
+
+1. Sort the unequal-length samples of a mini-batch by length; the longest sample becomes the first in the batch and the shortest the last
+   - `LoDTensor` -> `LoDRankTable` :heavy_plus_sign: `lod_rank_table` operator
+     - A `LoDRankTable` can be understood as the sequences of a LoDTensor sorted by length; it stores the indices after sorting
+
+2. Build the batch input of each time step: as the time step grows, the batch input of that step may shrink
+   - `TensorArray` :heavy_plus_sign: `lod_tensor_to_array` -> `LoDTensor` (without LoD)
+3. The output of every time step is written into an output `LoDTensorArray`
+4. When the `dynamicRNN` loop finishes, the output `LoDTensorArray` is re-sorted by the indices recorded in the `LoDRankTable`, restoring the original input order (a sketch of these operators follows)
+   - `TensorArray` :heavy_plus_sign: `array_to_lod_tensor` -> `LoDTensor`
+
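+A hedged sketch of the plumbing these steps describe (normally `DynamicRNN` generates it for you):
+
+```python
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[10], dtype='float32', lod_level=1)
+table = fluid.layers.lod_rank_table(x)                    # step 1: sort by length
+arr = fluid.layers.lod_tensor_to_array(x, table)          # step 2: per-step batches
+restored = fluid.layers.array_to_lod_tensor(arr, table)   # step 4: restore order
+```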
+
+
+---
+
+### A running example
+
+
+
+
+
+---
+### A running example
+
+
+
+
+
+
+
+- When execution reaches batches 5~7, the batch size shrinks
+
+
+
+---
+### A running example
+
+
+
+
+
+
+
+- What happens to the RNN's `memory` at batches 5~7?
+  - a `memory` points at some operator's output Tensor and "fetches back" the result after that operator's forward computation
+  - at batches 5~7 some sequences have ended; ==the next time step no longer needs to unroll over the already-finished sequences==
+  - in `dynamicRNN`, the `shrink_memory` operator shrinks the `memory`'s batch input accordingly
+
+
+
+---
+### Running example: batches 1~2
+
+
+
+Figure. Batch input to dynamicRNN for batches 1 and 2
+
+
+---
+### Running example: batches 3~4
+
+
+
+Figure. Batch input to dynamicRNN for batches 3 and 4
+
+
+---
+
+### Running example: batches 5~7
+
+
+
+Figure. Batch input to dynamicRNN for batches 5, 6, and 7
+
+
+---
+### ==8.== Fluid code layout
+
+---
+### Fluid code layout
+
+
+
+
+| Code layout | Module structure |
+|------|------|
+| (see figure) | (see figure) |
+
+
+
+
+
+---
+
+### ==9.== Documentation summary
+
+---
+
+
+- Design overview
+  - Refactoring overview [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
+  - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
+  - fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
+- Core concepts
+  - variable description [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md)
+  - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md)
+  - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+  - TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md)
+  - Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md)
+  - Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md)
+  - Scope [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)
+
+---
+
+- Important functional modules
+  - backward [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/backward.md)
+  - memory optimization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/memory_optimization.md)
+  - evaluator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/executor.md)
+  - python API [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md)
+  - regularization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/regularization.md)
+
+- Development guides
+  - Supporting new hardware device libraries [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md)
+  - Adding a new Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md)
+  - Adding a new Kernel [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
+
+
+
+---
+
+### ==10.== Development guide
+
+---
+
+#### Recommended development environment: build and test with Docker
+
+
+
+Building PaddlePaddle from source with Docker: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
+
+PaddlePaddle on Dockerhub: [->](https://hub.docker.com/r/paddlepaddle/paddle/tags/)
+
+1. Pull the PaddlePaddle Docker image
+   ```bash
+   docker pull paddlepaddle/paddle:latest-dev
+   ```
+
+1. Start a docker container
+
+   ```bash
+   docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+   ```
+
+1. Inside the container, build from source following the documentation [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html)
+
+
+
+---
+
+### Notes
+
+
+
+1. To keep it small, the PaddlePaddle Docker image does not ship vim by default; run `apt-get install -y vim` inside the container to install it.
+1. For development, use the image tagged `latest-dev`, which packages all build dependencies. `latest` and `latest-gpu` are production images, mainly intended for running PaddlePaddle programs.
+1. To run GPU programs inside Docker, nvidia-docker is recommended; [otherwise the CUDA libraries and devices must be mounted into the container manually](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html).
+
+
+ ```bash
+ nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+ ```
+
+
+
+
+
+---
+
+### [How to contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+
+
+
+- ==Before submitting a PullRequest, please read==: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+- Code requirements
+  1. Code comments follow the Doxygen style
+  1. Make sure the compiler option WITH_STYLE_CHECK is on and the build passes the code-style check
+  1. All code must come with unit tests, and all unit tests must pass
+- Use the `pre-commit` hooks when submitting Pull Requests
+  1. They help format the source code (C++, Python)
+  1. They automatically check some basics before a commit: one EOL per file, no large files added to Git, etc.
+  1. Install pre-commit and run it in the PaddlePaddle root directory:
+ ```bash
+ ➜ pip install pre-commit
+ ➜ pre-commit install
+ ```
+
+
+---
+
+### How to contribute
+
+
+
+1. Open an issue before starting development.
+   - This lets others know the work is in progress and avoids duplicated effort on the same feature.
+1. Every PR must be linked to a related issue. See [->](https://help.github.com/articles/closing-issues-using-keywords/) for how.
+   - Purpose: the repository keeps a record of which feature the PR develops and which problem it solves.
+   - When the PR is merged, the linked issue is closed automatically.
+1. During PR review, every reviewer comment must be answered.
+   - If the fix is done, a simple "Done" suffices.
+   - Purpose: review comments may be (1) questions, (2) issues deferred to a follow-up PR, or (3) suggestions the author disagrees with. An explicit reply leaves a traceable history for the reviewer and everyone else, making clear whether something was fixed, deferred to the next PR, or reasonably declined.
+
+
+
+---
+
+### ==11.== Adding a new Operator
+
+---
+
+### Concepts in brief
+
+
+
+Adding a new operator involves deriving from the following C++ classes:
+
+1. `framework::OperatorBase`: the Operator (Op for short) base class.
+1. `framework::OpKernel`: the base class of an Op's compute function, called a Kernel.
+1. `framework::OperatorWithKernel`: inherits from OperatorBase; an Op with a compute function is said to have a Kernel.
+1. `framework::OpProtoAndCheckerMaker`: describes the Op's inputs, outputs, attributes, and comments; used mainly for generating the Python API bindings.
+
+Depending on whether they have a kernel, Ops fall into two kinds:
+1. Ops with a Kernel inherit from OperatorWithKernel; ==the vast majority of operators are of this kind==
+1. Ops without a kernel inherit from OperatorBase; only a few Ops, such as while_op and ifelse_op, are of this kind
+
+This section mainly covers how to write an Op with a Kernel.
+
+
+
+---
+
+#### Which files does a new Operator add or modify?
+
+
+
+
+
+
+| Content | Where it is defined |
+|------|------|
+| OpProtoMaker definition | the `.cc` file; a backward Op needs no OpProtoMaker |
+| Op definition | the `.cc` file |
+| Kernel implementation | a Kernel shared by CPU and CUDA lives in the `.h` file; otherwise the CPU implementation goes into the `.cc` file and the CUDA implementation into the `.cu` file |
+| Op registration | Op registration goes into the `.cc` file; the CPU Kernel is registered in the `.cc` file and the CUDA Kernel in the `.cu` file |
+
+
+
+
+
+- Before adding an Operator, please read the [Operator naming convention](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/name_convention.md) and the [Operator Markdown comment convention](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/op_markdown_format.md).
+- New ops are added under [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators); file names end with `*_op.h` (if any), `*_op.cc`, and `*_op.cu` (if any).
+- The op and its Python binding are built automatically from the file name; please follow the naming convention above, otherwise the PyBind-related files and CMakeLists.txt must be modified by hand.
+
+
+---
+
+###### Implementing an Operator with a Kernel, step 1: define the ProtoMaker class
+
+
+
+The walkthrough below uses [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h) as the example
+
+- clip_op computes: $Out = \min(\max(X, min), max)$
+- First define a `ProtoMaker` to describe the Op's inputs and outputs, and add the comments (*the comments in the snippet below are simplified; real code must follow the comment convention*):
+
+ ```cpp
+  template <typename AttrType>
+  class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
+   public:
+    ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+        : OpProtoAndCheckerMaker(proto, op_checker) {
+      AddInput("X", "(Tensor)The input of clip op.");
+      AddOutput("Out", "(Tensor)The output of clip op.");
+      AddAttr<AttrType>("min", "(float)Minimum value.");
+      AddAttr<AttrType>("max", "(float)Maximum value.");
+      AddComment(R"DOC(
+  ...
+  )DOC");
+    }
+  };
+ ```
+
+
+
+---
+
+###### Implementing an Operator with a Kernel, step 2: define the Operator class
+
+
+
+The snippet below defines `clip_op`:
+
+```cpp
+class ClipOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"),
+ "Input(X) of ClipOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Out"),
+ "Output(Out) of ClipOp should not be null.");
+ auto x_dims = ctx->GetInputDim("X");
+    auto max = ctx->Attrs().Get<float>("max");
+    auto min = ctx->Attrs().Get<float>("min");
+ PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
+ ctx->SetOutputDim("Out", x_dims);
+ ctx->ShareLoD("X", /*->*/ "Out");
+ }
+};
+```
+
+
+---
+
+### What the Operator class must do
+
+
+
+1. clip_op inherits from `OperatorWithKernel`; the line
+
+   ```cpp
+   using framework::OperatorWithKernel::OperatorWithKernel;
+   ```
+   pulls in the constructors of the base class `OperatorWithKernel`.
+
+1. Override the `InferShape` interface.
+   - `InferShape` is a const function and must not modify the Op's member variables
+   - `InferShape`'s parameter is `const framework::InferShapeContext &ctx`, from which the inputs, outputs, and attributes can be retrieved
+   - `InferShape` is called twice: once at compile time (when the op is created) and once at runtime (when the op's `Run` method is called). It must:
+     1. check early and fail early: validate that the input dimensions, types, etc. are legal
+     2. set the shape of the output Tensor
+
+The `OpProtoMaker` and `Op` class definitions are usually placed in the `.cc` file.
+
+
+
+---
+
+### Additional notes
+
+
+
+1. `InferShape` currently supports two implementation styles; both end up registering a functor into the OpInfo struct.
+   1. Inherit from framework::InferShapeBase and implement it as a functor (see [mul_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22))
+   2. Override the InferShape function (see [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24))
+
+1. What is a `functor`?
+
+   - A class or struct that only overloads `operator()`; usually a compute function that several kernels can share.
+
+
+
+ ```cpp
+  template <typename T>
+  class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
+   public:
+    void operator()(const platform::CPUDeviceContext& ctx,
+                    framework::Tensor* out,
+                    const framework::Tensor* prob,
+                    const framework::Tensor* labels, const bool softLabel) {
+      ...
+    }
+  };
+ ```
+
+
+  - clip_op also shows this pattern of extracting a piece of computation into a functor: [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27).
+
+
+
+---
+
+###### Implementing an Operator with a Kernel, step 3: define the OpKernel class
+
+
+
+- `ClipKernel` inherits from `framework::OpKernel` and carries two template parameters:
+  1. `typename DeviceContext`: the device type; add this parameter when different devices share one Kernel, otherwise provide a separate specialization per device.
+  1. `typename T`: the supported data type, such as `float` or `double`
+
+- Override the `Compute` method in the `ClipKernel` class
+  1. `Compute` takes a single input parameter: `const framework::ExecutionContext& context`
+     - An `ExecutionContext` gathers the op's runtime input and output `Variable`s from the `Scope`, so inside `Compute` the op can fetch the inputs and outputs it needs simply by name
+     - Compared with `InferShapeContext`, an `ExecutionContext` additionally carries the device type
+  1. Implement the Kernel's concrete computation logic inside `Compute`
+
+
+
+---
+#### ClipKernel at a glance
+
+
+
+```cpp
+template <typename DeviceContext, typename T>
+class ClipKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max = context.Attr<T>("max");
+    auto min = context.Attr<T>("min");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+    int64_t numel = x->numel();
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x_data,
+          x_data + numel, out_data, ClipFunctor<T>(min, max));
+  }
+};
+```
+
+- To make writing `OpKernel`s simpler and to share code between CPU and CUDA, Fluid uses Eigen as the underlying matrix library
+- Fluid provides basic wrappers around Eigen's unsupported Tensor module that can be called directly inside `Compute`
+  - On using Eigen inside PaddlePaddle, see the [usage guide](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md).
+
+
+
+---
+###### Implementing an Operator with a Kernel, step 4: implement the backward Op
+
+
+
+- ==**A backward Op has no `ProtoMaker`**==; apart from that, it is defined and implemented exactly like a forward Op, so the details are not repeated
+- Only the backward Op's inputs and outputs need a note:
+  1. The backward Op's inputs
+     - the forward Op's outputs
+     - the gradients passed to the current Op during back-propagation
+     - Note that Fluid does not distinguish cost Ops from intermediate-layer Ops; every Op must handle the gradients it receives correctly
+  2. The backward Op's outputs
+     - the gradients with respect to the learnable parameters
+     - the gradients with respect to all inputs
+
+
+
+
+---
+
+###### Implementing an Operator with a Kernel, step 5: register the Op and its Kernels
+
+
+
+With the Op and its kernels implemented, the next step is to register the op and kernels in the `.cc` and `.cu` files
+
+1. Register the forward and backward Op classes and the CPU Kernels in the `.cc` file.
+
+
+
+ ```cpp
+   namespace ops = paddle::operators;
+   REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
+               ops::ClipOpGrad);
+   REGISTER_OP_CPU_KERNEL(
+       clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
+   REGISTER_OP_CPU_KERNEL(
+       clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
+ ```
+
+   - In the snippet above:
+
+     1. `REGISTER_OP`: registers the `ops::ClipOp` class under the type name `clip`, with `ops::ClipOpMaker` as its `ProtoMaker`, and registers `ops::ClipOpGrad` under the type name `clip_grad`
+     1. `REGISTER_OP_WITHOUT_GRADIENT`: registers an Op that has no backward pass, e.g. the optimizer-related Ops
+     1. `REGISTER_OP_CPU_KERNEL`: registers the `ops::ClipKernel` class specialized for `paddle::platform::CPUDeviceContext` and `float`, and likewise the `ops::ClipGradKernel` class
+
+
+1. Register the GPU Kernels in the `.cu` file in the same way
+   - If the CUDA Kernel is implemented with Eigen, the `.cu` file must begin with the macro definition `#define EIGEN_USE_GPU`
+
+
+
+---
+
+##### Building and the Python binding
+
+
+
+- The following command builds just the newly added Op:
+
+  ```
+  make mul_op
+  ```
+  - Note that running the unit tests requires building the whole project
+
+- If the file-naming rules above are followed, the build automatically generates the Python binding for the new op and links it into the produced library
+
+
+
+---
+
+###### Implementing an Operator with a Kernel, step 6: add a forward unit test and gradient check
+
+
+
+- Unit tests for new Ops all go into the [python/paddle/v2/fluid/tests/unittests](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/unittests) directory
+- Forward Operator unit test (a minimal sketch follows)
+
+  1. The Op unit test inherits from `OpTest`; the concrete tests live in `TestClipOp`, and every test case is named `TestXX`
+  1. Unit-testing an Operator requires:
+     1. defining the inputs, outputs, and relevant attributes in the `setUp` function
+     1. generating random input data
+     1. implementing the same computation as the forward operator in the Python script to obtain a reference output, then comparing it with the operator's forward output
+  1. The gradient-check machinery is already provided by the test framework; simply call the `check_grad` interface
+
+- For the full `clip_op` unit test, see [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_clip_op.py); it is not expanded here
+
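+A minimal sketch of the `OpTest` pattern for `clip` (the attribute values are illustrative; the real test lives in `test_clip_op.py`):
+
+```python
+import numpy as np
+from op_test import OpTest  # test helper shipped in the unittests directory
+
+class TestClipOp(OpTest):
+    def setUp(self):
+        self.op_type = "clip"
+        x = np.random.random((4, 5)).astype("float32")
+        self.inputs = {'X': x}
+        self.attrs = {'min': 0.2, 'max': 0.8}
+        # reference result computed in pure Python
+        self.outputs = {'Out': np.clip(x, 0.2, 0.8)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+```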
+
+
+---
+#### Building and running the unit tests
+
+
+
+- New `test_*.py` unit tests under the `python/paddle/v2/framework/tests` directory are picked up by the build automatically
+
+  - Running the unit tests requires building the whole project, with `WITH_TESTING` enabled, i.e. `cmake paddle_dir -DWITH_TESTING=ON`
+- After a successful build, run the unit tests with:
+
+ ```bash
+ make test ARGS="-R test_mul_op -V"
+ ```
+
+  or:
+
+ ```
+ ctest -R test_mul_op
+ ```
+
+
+---
+
+### Notes on adding an Op
+
+
+
+- Create separate `*_op.h` (if needed), `*_op.cc`, and `*_op.cu` (if needed) files for each Op. Putting several Ops into one file causes build errors.
+- The type name used when registering an Op must match the Op's name. Registering `REGISTER_OP(B, ...)` inside `A_op.cc` is not allowed; it breaks the unit tests.
+- If an Op has no CUDA Kernel, do not create an empty `*_op.cu`; it breaks the unit tests.
+- If several Ops depend on shared helper functions, put those helpers in files that do not match the `*_op.*` pattern, such as `gather.h`.
+
+
+
+---
+
+### ==12.== Usage FAQ
+
+---
+
+### Defining the forward computation
+
+
+
+- When the Python side executes:
+ ```python
+ import paddle.v2.fluid as fluid
+ ```
+  [`framework.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#L1040) defines two global `Program`s:
+ ```python
+ # program is a global instance.
+ _main_program_ = Program()
+ _startup_program_ = Program()
+ ```
+
+- Defining the forward pass amounts to continually appending Ops and Variables to the `main_program`
+- To execute a new `main_program`, call:
+ ```python
+ def switch_main_program(program):
+ """
+ Switch the main program to a new program.
+    This function returns the previous main program.
+ """
+ ……
+ ```
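+
+Alternatively, a hedged sketch of building layers into a fresh `Program` with `program_guard`:
+
+```python
+import paddle.fluid as fluid
+
+main_prog = fluid.Program()
+startup_prog = fluid.Program()
+with fluid.program_guard(main_prog, startup_prog):
+    # every layer created here goes into main_prog / startup_prog
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1)
+```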
+
+
+---
+
+### Customizing parameter initialization
+
+
+
+- Call the `fluid.ParamAttr(...)` interface to customize how a parameter is initialized
+
+ ```python
+ w_param_attrs = ParamAttr(name=None,
+ initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+ learning_rate=1.0,
+ regularizer=L1Decay(1.0),
+ trainable=True,
+ clip=GradientClipByValue(-1.0, 1.0),
+ )
+ y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+ ```
+
+- A related question: how to create a `Variable`
+ ```python
+ cur_program = Program()
+ cur_block = cur_program.current_block()
+ new_var = cur_block.create_var(name="X", shape=[-1, 16, 16], dtype="float32")
+ ```
+
+
+
+---
+
+### Appending the backward Ops
+
+
+
+- Call `fluid.backward.append_backward(X)` (where `X` is a Variable) to append the backward Ops for a forward `ProgramDesc`
+
+ ```python
+ data = fluid.layers.data(name="data", shape=(2,3,4))
+ out = fluid.layers.fc(input=data,size=128,act=None)
+ loss = fluid.layers.reduce_sum(out)
+ fluid.backward.append_backward(loss=loss)
+ ```
+
+- Appending the optimization Ops
+ ```python
+ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+ sgd_optimizer.minimize(loss)
+ ```
+
+- At any time, call `print(fluid.default_main_program())` to dump the current `main_program`
+
+- Once the whole `Program` has been built, call the interface below to run memory optimization:
+  ```python
+  fluid.memory_optimize(fluid.default_main_program())
+  ```
+  - _Note: memory optimization is still under active development and may not be fully stable._
+
+
+
+---
+
+### Summary: the compile-time pipeline
+
+
+
+- The user defines the forward computation
+- The backward Ops are appended to `default_main_program`
+- The gradient-clipping Ops are appended to `default_main_program`
+- The regularization Ops are appended to `default_main_program`
+- The state variables of the chosen optimizer are added to `default_startup_program`
+  - state variables include, e.g., the learning rate and the first- and second-order momentum history
+- The variable-initialization Ops are appended to `default_startup_program`
+- For the last op of the network, the Op that sets its received gradient is appended to `default_main_program`
+- Memory-optimization planning is performed
+
+
+
+---
+
+### Feeding data (1): via a feed dict
+
+
+
+- When calling the executor's run method, pass a feed dict; the feed op places the given data into the `x` and `y` Variables
+ ```python
+ y_data = np.random.randint(0, 8, [1]).astype("int32")
+ y_tensor = core.Tensor()
+ y_tensor.set(y_data, place)
+
+ x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32")
+ x_tensor = core.Tensor()
+ x_tensor.set(x_data, place)
+ ……
+  cost = exe.run(
+      fluid.default_main_program(),
+      feed={'x': x_tensor,
+            'y': y_tensor},
+      fetch_list=[avg_cost])
+ ```
+
+- This approach is fairly low-level and is mostly used in unit tests
+
+
+
+---
+
+### Feeding data (2): via the DataFeeder interface
+
+
+
+- Write a data_reader function; a data_reader is a Python generator
+
+ ```python
+ def demo_reader():
+ def random_generator():
+ yield np.random.uniform(0.1, 1, [4]), np.random.randint(0, 1, [1])
+ return random_generator
+ ```
+- Use the DataFeeder interface in a training job
+  ```python
+  train_reader = paddle.batch(
+      paddle.reader.shuffle(demo_reader(), buf_size=500), batch_size=4)
+  feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+  for data in train_reader():
+      cost = exe.run(
+          fluid.default_main_program(),
+          feed=feeder.feed(data),
+          fetch_list=[cost])
+ ```
+
+
+
+---
+
+### FAQ
+
+
+
+- How to use an evaluator? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py#L168)
+
+ ```python
+  accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+  for pass_id in range(PASS_NUM):
+      accuracy.reset()
+      for data in train_reader():
+          loss, acc = exe.run(fluid.default_main_program(),
+                              feed=feeder.feed(data),
+                              fetch_list=[avg_cost] + accuracy.metrics)
+          pass_acc = accuracy.eval(exe)
+          # acc: accuracy of the current mini-batch
+          # pass_acc: accumulated accuracy of the current pass so far
+      pass_total_acc = accuracy.eval(exe)  # accuracy of the whole pass
+ ```
+
+- How to run evaluation during training? [->](https://github.com/dzhwinter/benchmark/blob/master/fluid/vgg16.py#L144)
+- How to save a trained model? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L143)
+- How to load a trained model for inference? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L154)
+- How to define multiple Programs in one training job and run them alternately? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/demo/fc_gan.py)
+- How to profile? Fluid ships a profiling tool that can be called directly; see the example [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_profiler.py)
+
+
+
+
+---
diff --git a/doc/fluid/images/1.png b/doc/fluid/images/1.png
new file mode 100644
index 0000000000000000000000000000000000000000..67daf566f91aab570e60971c4ea8e2be876e214d
Binary files /dev/null and b/doc/fluid/images/1.png differ
diff --git a/doc/fluid/images/2.png b/doc/fluid/images/2.png
new file mode 100644
index 0000000000000000000000000000000000000000..43367777f41449a666e7a3b571f09ac5d5dfb1ae
Binary files /dev/null and b/doc/fluid/images/2.png differ
diff --git a/doc/fluid/images/3.png b/doc/fluid/images/3.png
new file mode 100644
index 0000000000000000000000000000000000000000..481021ef306e2596818aab7fe17a570754f63635
Binary files /dev/null and b/doc/fluid/images/3.png differ
diff --git a/doc/fluid/images/4.png b/doc/fluid/images/4.png
new file mode 100644
index 0000000000000000000000000000000000000000..4279f41e06de459f18b9a622539511d555e9a0af
Binary files /dev/null and b/doc/fluid/images/4.png differ
diff --git a/doc/fluid/images/LoDTensor.png b/doc/fluid/images/LoDTensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..75369f5378309e0f304b83f6bb69bdb195eac079
Binary files /dev/null and b/doc/fluid/images/LoDTensor.png differ
diff --git a/doc/fluid/images/compile_run_time.png b/doc/fluid/images/compile_run_time.png
new file mode 100644
index 0000000000000000000000000000000000000000..0bc9b2fd0e81b4851e6d96171ccb9a05d0f42a48
Binary files /dev/null and b/doc/fluid/images/compile_run_time.png differ
diff --git a/doc/fluid/images/executor.png b/doc/fluid/images/executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..b29c0d779e3d46b779b5baeabe3176adaeb00a6d
Binary files /dev/null and b/doc/fluid/images/executor.png differ
diff --git a/doc/fluid/images/fluid_examples.png b/doc/fluid/images/fluid_examples.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa99472c0f914cde128fd7b3bd8dc29ac24f94b6
Binary files /dev/null and b/doc/fluid/images/fluid_examples.png differ
diff --git a/doc/fluid/images/fluid_module_1.png b/doc/fluid/images/fluid_module_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..554782ba54e43efc3d6babbb94e3cac3530ac649
Binary files /dev/null and b/doc/fluid/images/fluid_module_1.png differ
diff --git a/doc/fluid/images/fluid_module_2.png b/doc/fluid/images/fluid_module_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..4219efccbb1e87839adf6b5720fe46808b7d2fcf
Binary files /dev/null and b/doc/fluid/images/fluid_module_2.png differ
diff --git a/doc/fluid/images/layer.png b/doc/fluid/images/layer.png
new file mode 100644
index 0000000000000000000000000000000000000000..e46db4c9c6f5b65ff274b498b716b11de343a8b0
Binary files /dev/null and b/doc/fluid/images/layer.png differ
diff --git a/doc/fluid/images/operator1.png b/doc/fluid/images/operator1.png
new file mode 100644
index 0000000000000000000000000000000000000000..3975b06f615b7a88dfc11e71b6451fdf4ce42d60
Binary files /dev/null and b/doc/fluid/images/operator1.png differ
diff --git a/doc/fluid/images/operator2.png b/doc/fluid/images/operator2.png
new file mode 100644
index 0000000000000000000000000000000000000000..b7bb1fae2050d3a70797517bc20dbbdef3dfcb7c
Binary files /dev/null and b/doc/fluid/images/operator2.png differ
diff --git a/doc/fluid/images/place.png b/doc/fluid/images/place.png
new file mode 100644
index 0000000000000000000000000000000000000000..14e77511d639af155e5a3725cde05323e0cc94f2
Binary files /dev/null and b/doc/fluid/images/place.png differ
diff --git a/doc/fluid/images/print_fluid_program.png b/doc/fluid/images/print_fluid_program.png
new file mode 100644
index 0000000000000000000000000000000000000000..e8e459e1b3d5c8706b3caa05dc371db8d46df4a5
Binary files /dev/null and b/doc/fluid/images/print_fluid_program.png differ
diff --git a/doc/fluid/images/program_desc1.png b/doc/fluid/images/program_desc1.png
new file mode 100644
index 0000000000000000000000000000000000000000..0656336914ece957f2e5bb4d70ad337a63e31d88
Binary files /dev/null and b/doc/fluid/images/program_desc1.png differ
diff --git a/doc/fluid/images/program_desc2.png b/doc/fluid/images/program_desc2.png
new file mode 100644
index 0000000000000000000000000000000000000000..db5bfa1231345add8661b4f8ef0fc9d861f40d24
Binary files /dev/null and b/doc/fluid/images/program_desc2.png differ
diff --git a/doc/fluid/images/raw_input.png b/doc/fluid/images/raw_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..0725f92d2b169c2b59ec7c68b402859c2a2dd1d8
Binary files /dev/null and b/doc/fluid/images/raw_input.png differ
diff --git a/doc/fluid/images/scope_variable_tensor.png b/doc/fluid/images/scope_variable_tensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..59b0de6fb36f9f6b469227c05760a7612bb30b4d
Binary files /dev/null and b/doc/fluid/images/scope_variable_tensor.png differ
diff --git a/doc/fluid/images/sorted_input.png b/doc/fluid/images/sorted_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff601128368ee179e3fd33e5e295a9ddd3dcbaeb
Binary files /dev/null and b/doc/fluid/images/sorted_input.png differ
diff --git a/doc/fluid/images/transpiler.png b/doc/fluid/images/transpiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..422973c0dc7aa2b544d2fc86a97ace706388cb9e
Binary files /dev/null and b/doc/fluid/images/transpiler.png differ
diff --git a/doc/fluid/images/user_interface.png b/doc/fluid/images/user_interface.png
new file mode 100644
index 0000000000000000000000000000000000000000..ffc94e3d8945ec6291460afd90e8fcc600828390
Binary files /dev/null and b/doc/fluid/images/user_interface.png differ
diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst
index 077f5e9b189269f9f6c9cf68310e2bfd43d8cb67..741c01ce5428c0046daa5a784da70d4bb492438c 100644
--- a/doc/v2/build_and_install/build_from_source_cn.rst
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@@ -35,13 +35,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
# 2. 可选步骤:源码中构建用于编译PaddlePaddle的Docker镜像
docker build -t paddle:dev .
# 3. 执行下面的命令编译CPU-Only的二进制
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. 或者也可以使用为上述可选步骤构建的镜像(必须先执行第2步)
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
-注:上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。如果使用自行
-构建的镜像(上述第4步)会执行 :code:`Dockerfile` 描述的默认入口程序 :code:`build.sh` 可以省略步骤3中
-最后的执行脚本的命令。
+注:上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。
编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装:
@@ -72,15 +70,15 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
.. code-block:: bash
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
如果期望执行其中一个单元测试,(比如 :code:`test_sum_op` ):
.. code-block:: bash
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
- bash /paddle/paddle/scripts/docker/build.sh
- cd /paddle/build
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+ ./paddle/scripts/paddle_build.sh build
+ cd build
ctest -R test_sum_op -V
.. _faq_docker:
diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst
index 545e61ce9602240807d515e9eae971dfca9ddd7f..b06c43e19dcfc52ad0f074a85517a16744895a3a 100644
--- a/doc/v2/build_and_install/build_from_source_en.rst
+++ b/doc/v2/build_and_install/build_from_source_en.rst
@@ -34,14 +34,12 @@ Or you can build your own image from source as the optional step below:
# 2. Optional: build development docker image from source
docker build -t paddle:dev .
# 3. Run the following command to build a CPU-Only binaries
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
NOTE: The above command try to mount the current working directory (root directory of source code)
-into :code:`/paddle` directory inside docker container. If you are using your own image
-(Step 4) it will run default entry-point :code:`build.sh` , so you could omit the last
-command in step 3.
+into :code:`/paddle` directory inside docker container.
When the compile finishes, you can get the output whl package under
build/python/dist, then you can choose to install the whl on local
@@ -74,15 +72,15 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
.. code-block:: bash
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
If you wish to run only one unit test, like :code:`test_sum_op`:
.. code-block:: bash
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
- bash /paddle/paddle/scripts/docker/build.sh
- cd /paddle/build
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+ ./paddle/scripts/paddle_build.sh build
+ cd build
ctest -R test_sum_op -V
.. _faq_docker:
diff --git a/doc/v2/build_and_install/docker_install_cn.rst b/doc/v2/build_and_install/docker_install_cn.rst
index da876b03e384a8175b27f78756af648c80fc6784..106c86bace075764c84bc2a7f7cb09d466fa8794 100644
--- a/doc/v2/build_and_install/docker_install_cn.rst
+++ b/doc/v2/build_and_install/docker_install_cn.rst
@@ -98,7 +98,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
国内用户可以使用下面的镜像源来加速访问:
- .. code-block: bash
+ .. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
diff --git a/doc/v2/build_and_install/docker_install_en.rst b/doc/v2/build_and_install/docker_install_en.rst
index 5dbdedc4cb064ef415e8d19f00727a16d1c175c6..25aecb8d0da9feb00006da6259b529b7011d91cb 100644
--- a/doc/v2/build_and_install/docker_install_en.rst
+++ b/doc/v2/build_and_install/docker_install_en.rst
@@ -105,7 +105,7 @@ We provide a packaged book image, simply issue the command:
For users in China, we provide a faster mirror:
- .. code-block: bash
+ .. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
diff --git a/paddle/.gitignore b/paddle/.gitignore
index 1c1c0c2c829f088d7e3f52ca007fcb8f33a16a36..01904aa6ef2057afee95ddd6e30cde064b06c52e 100644
--- a/paddle/.gitignore
+++ b/paddle/.gitignore
@@ -11,7 +11,6 @@ GTAGS
*.pb.cc
*.pb.h
*_pb2.py
-paddle_*
output/
google/
Makefile
diff --git a/paddle/contrib/CMakeLists.txt b/paddle/contrib/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4b19256ef4533a09162edf907f6cd51146517e46
--- /dev/null
+++ b/paddle/contrib/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+add_subdirectory(inference)
diff --git a/paddle/contrib/float16/README.md b/paddle/contrib/float16/README.md
index ded959c47cb81b9384abbb9815773e25969344ec..58b4a50666bfb622af8acbce29355f2a4a870a82 100644
--- a/paddle/contrib/float16/README.md
+++ b/paddle/contrib/float16/README.md
@@ -89,7 +89,7 @@ cd Paddle
# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
nvidia-docker build -t paddle:float16 .
# After running this, different results will be written to different log files in Paddle/contrib/float16/
-nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
+nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/paddle/contrib/float16/run_float16_demo.sh
```
#### Accuracy
diff --git a/paddle/contrib/float16/run_float16_demo.sh b/paddle/contrib/float16/run_float16_demo.sh
index d8a34ee67b8fab214fa6e96104304689211f84da..031225a85dabb26e5d9ea06f58909c049e7f0c08 100755
--- a/paddle/contrib/float16/run_float16_demo.sh
+++ b/paddle/contrib/float16/run_float16_demo.sh
@@ -3,7 +3,7 @@
BUILD_PATH=/paddle/fp16_build
WHEEL_PATH=$BUILD_PATH/python/dist
INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
-DEMO_PATH=/paddle/contrib/float16
+DEMO_PATH=/paddle/paddle/contrib/float16
# Use the single most powerful CUDA GPU on your machine
export CUDA_VISIBLE_DEVICES=0
@@ -50,7 +50,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
- --data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
--repeat=$REPEAT \
@@ -68,7 +67,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_resnet \
- --data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
--repeat=$REPEAT \
@@ -86,7 +84,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
- --data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
--repeat=$REPEAT \
@@ -104,7 +101,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
- --data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
--repeat=$REPEAT \
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a4fe10f708e5bb8b28e34b2d91b2254c346c467f
--- /dev/null
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -0,0 +1,57 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+function(inference_api_test TARGET_NAME TEST_SRC DEP_TEST)
+ set(options "")
+ set(oneValueArgs "")
+ set(multiValueArgs ARGS)
+ cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+ set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+ set(arg_list "")
+ if(inference_test_ARGS)
+ foreach(arg ${inference_test_ARGS})
+ list(APPEND arg_list "_${arg}")
+ endforeach()
+ else()
+ list(APPEND arg_list "_")
+ endif()
+ foreach(arg ${arg_list})
+ string(REGEX REPLACE "^_$" "" arg "${arg}")
+ cc_test(${TARGET_NAME}
+ SRCS ${TEST_SRC}
+ DEPS paddle_fluid_api paddle_inference_api paddle_inference_api_impl
+ ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+ # set_tests_properties(${TARGET_NAME}
+ # PROPERTIES DEPENDS ${DEP_TEST})
+ endforeach()
+endfunction(inference_api_test)
+
+
+cc_library(paddle_inference_api
+ SRCS paddle_inference_api.cc
+ DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
+cc_library(paddle_inference_api_impl
+ SRCS paddle_inference_api_impl.cc
+ DEPS paddle_inference_api paddle_fluid_api)
+
+cc_test(test_paddle_inference_api
+ SRCS test_paddle_inference_api.cc
+ DEPS paddle_inference_api)
+
+inference_api_test(test_paddle_inference_api_impl
+ test_paddle_inference_api_impl.cc
+ test_word2vec)
diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d67e1e7667800d6dd00cb8915b0d6dc7c664970b
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.cc
@@ -0,0 +1,15 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h
index dbaa7c95b97e954537707566e5b7458e6afd14c8..9ac8ebdef8151f2a144b479fa258b8bc830fc2e9 100644
--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -12,49 +12,74 @@
See the License for the specific language governing permissions and
limitations under the License. */
+/*
+ * This file contains the definition of a simple Inference API for Paddle.
+ *
+ * ATTENTION: It requires some C++ features, for lower version C++ or C, we
+ * might release another API.
+ */
+
#pragma once
+#include <memory>
#include <string>
#include <vector>
namespace paddle {
-class Predictor {
-public:
- struct Attr;
- Predictor() = default;
+enum PaddleDType {
+ FLOAT32,
+ INT64,
+};
- // Build the network before inference.
- bool Init(const Attr& attr);
+struct PaddleBuf {
+ void* data; // pointer to the data memory.
+ size_t length; // number of memory bytes.
+};
+
+struct PaddleTensor {
+ std::string name; // variable name.
+ std::vector<int> shape;
+ PaddleBuf data; // blob of data.
+ PaddleDType dtype;
+};
+
+/*
+* A simple Inference API for Paddle. Currently this API might just be used by
+* non-sequence scenarios.
+* TODO(Superjomn) Prepare another API for NLP-related usages.
+*/
+class PaddlePredictor {
+public:
+ struct Config;
+ PaddlePredictor() = default;
+ PaddlePredictor(const PaddlePredictor&) = delete;
// Predict an record.
- // Arguments:
- // inputs: the name of the input variables.
- // outputs: the name of the output varaibles.
- // input_shapes: the shape of the input variables.
- // output_shapes: the shape of the output variables.
- // input_data: the data of the input variables.
- // output_data: the data of the output variables.
- bool Run(const std::vector<std::string>& inputs,
- const std::vector<std::string>& outputs,
- const std::vector<std::vector<int>>& input_shapes,
- const std::vector<std::vector<int>>& output_shapes,
- const std::vector<std::vector<float>>& input_data,
- std::vector<std::vector<float>>* output_data);
-
- // Clone a predictor that share the model weights.
- Predictor* Clone();
+ // The caller should be responsible for allocating and releasing the memory of
+ // `inputs`. `inputs` should be alive until Run returns. caller should be
+ // responsible for releasing the memory of `output_data`.
+ virtual bool Run(const std::vector<PaddleTensor>& inputs,
+ std::vector<PaddleTensor>* output_data) = 0;
+
+ // Clone a predictor that shares the model weights; the cloned predictor
+ // should be thread-safe.
+ virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
// Destroy the Predictor.
- ~Predictor();
+ virtual ~PaddlePredictor() {}
- struct Attr {
+ friend std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
+ const PaddlePredictor::Config& config);
+
+ // The common configs for all the predictors.
+ struct Config {
enum class EngineKind;
std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on
- // third-party engines.
- EngineKind engine_kind{Attr::EngineKind::kNone};
+ // third-party engines.
+ EngineKind engine_kind{Config::EngineKind::kNone};
enum class EngineKind {
kNone = -1, // Use the native Fluid facility.
@@ -66,4 +91,8 @@ public:
};
};
+// A factory to help create different predictors.
+template <typename ConfigT>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+
} // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ecca16d3f82bbeee6858883a0f9e577a479f9d06
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -0,0 +1,309 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include
+#include
+#include