Commit 632108dc authored by: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_add_random_crop_op

......@@ -34,6 +34,14 @@ repos:
entry: bash ./tools/codestyle/cpplint_pre_commit.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
- repo: local
hooks:
- id: pylint-doc-string
name: pylint
description: Check python docstring style using docstring_checker.
entry: bash ./tools/codestyle/pylint_pre_commit.hook
language: system
files: \.(py)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
......
......@@ -18,6 +18,8 @@ env:
addons:
ssh_known_hosts: 13.229.163.131
before_install:
# For the pylint docstring checker
- sudo pip install pylint pytest astroid isort
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
......
......@@ -57,7 +57,10 @@ option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributions" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -202,7 +205,7 @@ endif(USE_NNPACK)
add_subdirectory(proto)
if(NOT MOBILE_INFERENCE)
if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
# "add_subdirectory(go)" should be placed after the following loine,
# because it depends on paddle/optimizer.
add_subdirectory(paddle/optimizer)
......@@ -230,3 +233,7 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
if (WITH_CONTRIB)
add_subdirectory(paddle/contrib)
endif()
......@@ -79,6 +79,9 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python
# For the docstring checker
RUN pip install pylint pytest astroid isort
COPY ./python/requirements.txt /root/
RUN pip install -r /root/requirements.txt
......@@ -101,6 +104,3 @@ RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
# By default, the development image runs the build script.
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
......@@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp
CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
# Cluster Training Benchmark
## Setup
- Platform
- Kubernetes: v1.6.2
- Linux Kernel: v3.10.0
- Resource
- CPU: 10 Cores per Pod
- Memory: 5GB per Pod
- Docker Image
We use different base Docker images to run the benchmark on Kubernetes:
- PaddlePaddle v2: paddlepaddle/paddle:0.11.0
- PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
- TensorFlow: tensorflow/tensorflow:1.5.0-rc0
- Model
VGG16 is used in this benchmark.
## Cases
- Variable
- Batch Size of training data.
- PServer count of the training job.
- The number of trainers.
- Invariant
- The resource of trainer/pserver Pod.
### Measure the Performance for Different Batch Size
- PServer Count: 40
- Trainer Count: 100
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure the Performance for Different PServer Count
- Trainer Count: 100
- Batch Size: 64
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>10</th>
<th>20</th>
<th>40 </th>
<th>60</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure Parallel Efficiency By Increasing Trainer Count
- PServer Count: 20
- Batch Size: 64
- Metrics:
$S = T_1 / T_N$
where $S$ is the speedup: the ratio of $T_1$, the training time with 1 trainer, to $T_N$, the training time with $N$ trainers.
The parallel efficiency is then:
$E = S / N$
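For concreteness, a minimal sketch of how these metrics are computed (the timings below are made-up placeholders, not measured results):

```python
# Hypothetical timings, for illustration only.
t1 = 1000.0   # training time with 1 trainer (seconds)
tn = 60.0     # training time with N trainers (seconds)
n = 20        # trainer count

speedup = t1 / tn          # S = T1 / TN
efficiency = speedup / n   # E = S / N
print("S = %.2f, E = %.1f%%" % (speedup, efficiency * 100))
```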
<table>
<thead>
<tr>
<th>Trainer Count </th>
<th>1</th>
<th>10</th>
<th>20 </th>
<th>30</th>
<th>40</th>
<th>50</th>
<th>60 </th>
<th>70</th>
<th>80</th>
<th>90</th>
<th>100 </th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
## Reproduce the benchmark
TODO
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
# You can get the mirror list here:
# https://launchpad.net/ubuntu/+archivemirrors
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
RUN pip install -U kubernetes opencv-python
RUN pip install paddlepaddle
# If the network is slow, you may need to add a proxy here.
# ENV https_proxy=
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# Unset the proxy if it was set.
# ENV https_proxy=""
# NOTE: By default, CI-built wheel packages are compiled with WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
# TensorFlow on Kubernetes
RUN pip install tensorflow==1.4.0
ADD tf_k8s /usr/bin
RUN chmod +x /usr/bin/tf_k8s
ADD vgg16_tf.py /workspace/
# The lines below may change frequently during debugging.
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD vgg16_fluid.py vgg16_v2.py /workspace/
# Performance of Distributed VGG16
## Test Result
### Hardware Information
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz : 2101.000
- cache size : 20480 KB
### Blas settings
Setting environment variable: `MKL_NUM_THREADS=1`.
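If you drive training from Python, one way to pin this is a sketch like the one below; it assumes the variable is exported before any library that links against MKL initializes its thread pool:

```python
import os

# Set before importing numpy/paddle, which may load MKL at import time.
os.environ["MKL_NUM_THREADS"] = "1"

import numpy as np  # imported after the env var on purpose
```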
### Single Node Single Thread
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 15.44 </td>
<td> 16.32 </td>
<td> 16.74 </td>
<td> 16.79 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 15.97 </td>
<td> 17.04 </td>
<td> 17.60 </td>
<td> 17.83 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> 9.09 </td>
<td> 9.10 </td>
<td> 9.24 </td>
<td> 8.66 </td>
</tr>
</tbody>
</table>
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 190.20 </td>
<td> 222.15 </td>
<td> 247.40 </td>
<td> 258.18 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 170.96 </td>
<td> 233.71 </td>
<td> 256.14 </td>
<td> 329.23 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Acceleration Rate
- PServer Count: 20
- Batch Size: 128
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Trainer Count </th>
<th>20</th>
<th>40</th>
<th>80</th>
<th>100</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 263.29 (78.64%) </td>
<td> 518.80 (77.47%) </td>
<td> 836.26 (62.44%) </td>
<td> 1019.29 (60.89%) </td>
</tr>
<tr>
<td>PaddlePaddle v2 (needs more tests) </td>
<td> 326.85 (92.85%) </td>
<td> 534.58 (75.93%) </td>
<td> 853.30 (60.60%) </td>
<td> 1041.99 (59.20%) </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Different PServer Count
- Trainer Count: 60
- Batch Size: 128
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>3</th>
<th>6</th>
<th>10</th>
<th>20</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid (should be fixed in the next PR) </td>
<td> 589.1 </td>
<td> 592.6 </td>
<td> 656.4 </td>
<td> 655.8 </td>
</tr>
<tr>
<td>PaddlePaddle v2 (needs more tests) </td>
<td> 593.4 </td>
<td> 791.3 </td>
<td> 729.7 </td>
<td> 821.7 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
*The performance gap between Fluid and v2 comes from network interference.*
## Steps to Run the Performance Test
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the Docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
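To collect logs programmatically instead, here is a sketch using the `kubernetes` Python client that the benchmark image already installs; the `"default"` namespace and the `paddle-job=vgg16job` label are assumptions, so match them to your `trainer.yaml`:

```python
from kubernetes import client, config

config.load_kube_config()  # reuses your configured kubectl context
v1 = client.CoreV1Api()

# Namespace and label selector are assumptions; adjust to your job.
pods = v1.list_namespaced_pod("default", label_selector="paddle-job=vgg16job")
for pod in pods.items:
    print("==== %s ====" % pod.metadata.name)
    print(v1.read_namespaced_pod_log(pod.metadata.name, "default"))
```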
## Enable Verbose Logs
Edit `pserver.yaml` and `trainer.yaml`, and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happens in detail.
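If you generate the YAML with a script such as `kube_gen_job.py` (which appears later in this change), the same switches can be appended to the container env list; a sketch, assuming `envs` is that list:

```python
envs = []  # stands in for the env list built by kube_gen_job.py
envs.append({"name": "GLOG_v", "value": "3"})
envs.append({"name": "GLOG_logtostderr", "value": "1"})
```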
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: MKL_NUM_THREADS
value: "1"
- name: TRAINING_ROLE
value: "PSERVER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
command: ["paddle_k8s", "start_fluid"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: TRAINING_ROLE
value: "TRAINER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
#!/bin/bash
# Update to point to the source file.
VGG_SRC="vgg16_fluid.py"
export TRAINING_ROLE=PSERVER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
# Need to wait for the ps to start first.
sleep 10
echo "done start ps"
export TRAINING_ROLE=TRAINER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
#!/bin/bash
check_trainer_ret() {
ret=$1
stdbuf -oL echo "job returned $ret...setting pod return message..."
stdbuf -oL echo "==============================="
if [ $ret -eq 136 ] ; then
echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
elif [ $ret -eq 139 ] ; then
echo "Segmentation Fault" > /dev/termination-log
elif [ $ret -eq 1 ] ; then
echo "General Error" > /dev/termination-log
elif [ $ret -eq 134 ] ; then
echo "Program Abort" > /dev/termination-log
fi
stdbuf -oL echo "termination log wroted..."
exit $ret
}
g_pservers=""
g_trainers=""
wait_running_pods(){
pserver_label="tf-job-pserver=${JOB_NAME}"
trainer_label="tf-job-trainer=${JOB_NAME}"
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
}
start_tf_pserver(){
wait_running_pods
label="tf-job-pserver=${JOB_NAME}"
pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
}
start_tf_trainer(){
wait_running_pods
label="tf-job-trainer=${JOB_NAME}"
trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
check_trainer_ret $?
}
start_tf(){
if [[ "${TF_JOB_NAME}" == "worker" ]]; then
start_tf_trainer
else
start_tf_pserver
fi
}
usage() {
echo "usage: tf_k8s [<args>]:"
echo " start_tf Start tensorflow jobs"
}
case "$1" in
start_tf)
start_tf
;;
--help)
usage
;;
*)
usage
;;
esac
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-tf-pserver
spec:
replicas: 10
template:
metadata:
labels:
tf-job-pserver: vgg16job-tf
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: ENTRY
value: "python vgg16_tf.py"
- name: JOB_NAME
value: vgg16job-tf
- name: PSERVERS_NUM
value: "10"
- name: TF_JOB_NAME
value: "ps"
- name: TRAINERS_NUM
value: "20"
- name: BATCH_SIZE
value: "128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-tf-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
tf-job-trainer: vgg16job-tf
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: JOB_NAME
value: vgg16job-tf
- name: TF_JOB_NAME
value: "worker"
- name: ENTRY
value: "python vgg16_tf.py"
- name: PSERVERS_NUM
value: "10"
- name: BATCH_SIZE
value: "128"
- name: TRAINERS_NUM
value: "20"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16v2job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16v2job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "python train.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
command: ["paddle_k8s", "start_pserver"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16v2job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16v2job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "256"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function
import sys
import time
import numpy as np
import paddle.v2 as paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
import argparse
import functools
import os
from paddle.fluid import debuger
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=16, help="Batch size for training.")
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='CPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument('--device_id', type=int, default=0, help="The device id.")
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
help='The data order; currently only NCHW is supported.')
parser.add_argument(
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--local',
type=str2bool,
default=True,
help='Whether to run as local mode.')
parser.add_argument(
"--ps_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--trainer_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--profile", action='store_true', help="If set, profile a few steps.")
# Flags for defining the tf.train.Server
parser.add_argument(
"--task_index", type=int, default=0, help="Index of task within the job")
args = parser.parse_args()
def vgg16_bn_drop(input):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act='relu',
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type='max')
conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
return fc2
def main():
if args.data_set == "cifar10":
classdim = 10
if args.data_format == 'NCHW':
data_shape = [3, 32, 32]
else:
data_shape = [32, 32, 3]
else:
classdim = 102
if args.data_format == 'NCHW':
data_shape = [3, 224, 224]
else:
data_shape = [224, 224, 3]
# Input data
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(batch_acc)
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimize_ops, params_grads = optimizer.minimize(avg_cost)
# Initialize executor
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
args.device_id)
exe = fluid.Executor(place)
# test
def test(exe):
test_pass_acc = fluid.average.WeightedAverage()
for batch_id, data in enumerate(test_reader()):
img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
outs = exe.run(inference_program,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[batch_acc, batch_size])
test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
return test_pass_acc.eval()
def train_loop(exe, trainer_prog):
iters = 0
ts = time.time()
train_pass_acc = fluid.average.WeightedAverage()
for pass_id in range(args.num_passes):
# train
start_time = time.time()
num_samples = 0
train_pass_acc.reset()
def run_step(batch_id, data):
img_data = np.array(
map(lambda x: x[0].reshape(data_shape), data)).astype(
"float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
loss, acc, b_size = exe.run(
trainer_prog,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[avg_cost, batch_acc, batch_size])
return loss, acc, b_size
if args.profile:
with profiler.profiler('All', 'total',
'/tmp/profile_vgg_%d' % args.task_index):
for batch_id, data in enumerate(train_reader()):
if batch_id > 5: break
run_step(batch_id, data)
total_time = 0.0
count = 0
for batch_id, data in enumerate(train_reader()):
ts = time.time()
loss, acc, b_size = run_step(batch_id, data)
iters += 1
num_samples += len(data)
train_pass_acc.add(value=acc, weight=b_size)
duration = time.time() - ts
total_time += duration
count += len(data)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
"Speed = %.2f (%.2f) img/s" % (pass_id, iters, loss, acc,
len(data) / duration,
count / total_time)
) # The accuracy is accumulated over batches, not just the current batch.
pass_elapsed = time.time() - start_time
pass_train_acc = train_pass_acc.eval()
pass_test_acc = test(exe)
print("Task:%d Pass = %d, Training performance = %f imgs/s, "
"Train accuracy = %f, Test accuracy = %f\n" %
(args.task_index, pass_id, num_samples / pass_elapsed,
pass_train_acc, pass_test_acc))
if args.local:
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
train_loop(exe, fluid.default_main_program())
else:
trainers = int(os.getenv("TRAINERS")) # total trainer count
print("trainers total: ", trainers)
training_role = os.getenv(
"TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id=args.task_index,
pservers=args.ps_hosts,
trainers=trainers)
if training_role == "PSERVER":
current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
"PADDLE_INIT_PORT")
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog)
exe.run(pserver_startup)
exe.run(pserver_prog)
elif training_role == "TRAINER":
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
paddle.dataset.flowers.test(),
batch_size=args.batch_size)
trainer_prog = t.get_trainer_program()
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
# TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
exe.run(fluid.default_startup_program())
train_loop(exe, trainer_prog)
else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == "__main__":
print_arguments()
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in TensorFlow
You can find distributed training example templates here:
https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
https://www.tensorflow.org/deploy/distributed
"""
import tensorflow as tf
import paddle.v2 as paddle
import numpy as np
import argparse
import time
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='CPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
'--data_format',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
help='The data order, NCHW=[batch, channels, height, width]. '
'Only NHWC is supported right now.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
"--ps_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--worker_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--job_name", type=str, default="", help="One of 'worker', 'ps'")
# Flags for defining the tf.train.Server
parser.add_argument(
"--task_index", type=int, default=0, help="Index of task within the job")
args = parser.parse_args()
class VGG16Model(object):
def __init__(self):
self.parameters = []
def batch_norm_relu(self, inputs, is_training):
"""Performs a batch normalization followed by a ReLU."""
# We set fused=True for a significant speed boost. See
# https://www.tensorflow.org/speed/speed_guide#common_fused_ops
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if args.data_format == 'NCHW' else -1,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
inputs = tf.nn.relu(inputs)
return inputs
def conv_bn_layer(self,
name,
images,
kernel_shape,
is_training,
drop_rate=0.0):
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
kernel_shape, dtype=tf.float32, stddev=1e-1),
name='weights')
conv = tf.nn.conv2d(
images,
kernel, [1, 1, 1, 1],
data_format=args.data_format,
padding='SAME')
biases = tf.Variable(
tf.constant(
0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(conv, biases)
out = self.batch_norm_relu(out, is_training)
out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
return out
def fc_layer(self, name, inputs, shape):
with tf.name_scope(name) as scope:
fc_w = tf.Variable(
tf.truncated_normal(
shape, dtype=tf.float32, stddev=1e-1),
name='weights')
fc_b = tf.Variable(
tf.constant(
0.0, shape=[shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
return out
def network(self, images, class_dim, is_training):
""" VGG16 model structure.
TODO(kuke): enable this network to support the 'NCHW' data format
"""
# conv1
conv1_1 = self.conv_bn_layer(
'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
conv1_2 = self.conv_bn_layer(
'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
# pool1
pool1 = tf.nn.max_pool(
conv1_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool1')
# conv2
conv2_1 = self.conv_bn_layer(
'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
conv2_2 = self.conv_bn_layer(
'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
# pool2
pool2 = tf.nn.max_pool(
conv2_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool2')
# conv3
conv3_1 = self.conv_bn_layer(
'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
conv3_2 = self.conv_bn_layer(
'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
conv3_3 = self.conv_bn_layer(
'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
# pool3
pool3 = tf.nn.max_pool(
conv3_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool3')
# conv4
conv4_1 = self.conv_bn_layer(
'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
conv4_2 = self.conv_bn_layer(
'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv4_3 = self.conv_bn_layer(
'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool4
pool4 = tf.nn.max_pool(
conv4_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# conv5
conv5_1 = self.conv_bn_layer(
'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_2 = self.conv_bn_layer(
'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_3 = self.conv_bn_layer(
'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool5
pool5 = tf.nn.max_pool(
conv5_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool5')
# flatten
shape = int(np.prod(pool5.get_shape()[1:]))
pool5_flat = tf.reshape(pool5, [-1, shape])
# fc1
drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
fc1 = self.fc_layer('fc1', drop, [shape, 512])
# fc2
bn = self.batch_norm_relu(fc1, is_training)
drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
fc2 = self.fc_layer('fc2', drop, [512, 512])
fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
return fc3
def run_benchmark(cluster_spec, server):
"""Run benchmark on cifar10 or flowers."""
if args.data_set == "cifar10":
class_dim = 10
raw_shape = (3, 32, 32)
dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
None, 3, 32, 32)
else:
class_dim = 102
raw_shape = (3, 224, 224)
dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
None, 3, 224, 224)
device = tf.train.replica_device_setter(
worker_device="/job:worker/task:{}".format(args.task_index),
cluster=cluster_spec)
with tf.device(device):
images = tf.placeholder(tf.float32, shape=dat_shape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
vgg16 = VGG16Model()
logits = vgg16.network(images, class_dim, is_training)
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
avg_loss = tf.reduce_mean(loss)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
global_step = tf.Variable(0, name='global_step', trainable=False)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_loss, global_step=global_step)
summary_op = tf.summary.merge_all()
init_op = tf.global_variables_initializer()
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
buf_size=5120),
batch_size=args.batch_size)
# test
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
return np.mean(test_accs)
config = tf.ConfigProto(
intra_op_parallelism_threads=1,
inter_op_parallelism_threads=1,
log_device_placement=True)
config.gpu_options.allow_growth = True
hooks = [tf.train.StopAtStepHook(last_step=1000000)]
with tf.train.MonitoredTrainingSession(
master=server.target,
is_chief=(args.task_index == 0),
hooks=hooks,
config=config) as sess:
iters, num_samples, start_time = 0, 0, 0.0
for pass_id in range(args.num_passes):
# train
num_samples = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
train_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
train_labels = np.array(map(lambda x: x[1], data)).astype(
'int64')
iter_begin_time = time.time()
_, loss, acc = sess.run([train_op, avg_loss, accuracy],
feed_dict={
images: train_images,
labels: train_labels,
is_training: True
})
iters += 1
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
% (pass_id, iters, loss, acc,
len(data) / (time.time() - iter_begin_time)))
num_samples += len(data)
train_elapsed = time.time() - start_time
# test
pass_test_acc = test()
print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, pass_test_acc))
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
print_arguments()
ps_hosts = args.ps_hosts.split(",")
worker_hosts = args.worker_hosts.split(",")
# Create a cluster from the parameter server and worker hosts.
cluster_spec = tf.train.ClusterSpec({
"ps": ps_hosts,
"worker": worker_hosts
})
# Create and start a server for the local task.
server = tf.train.Server(
cluster_spec, job_name=args.job_name, task_index=args.task_index)
if args.job_name == "ps":
print("start pserver")
server.join()
elif args.job_name == "worker":
print("start worker")
run_benchmark(cluster_spec, server)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os
DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
print "batch_size", BATCH_SIZE
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0
def vgg(input, nums, class_dim):
def conv_block(input, num_filter, groups, num_channels=None):
return paddle.networks.img_conv_group(
input=input,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=paddle.activation.Relu(),
pool_type=paddle.pooling.Max())
assert len(nums) == 5
# the input feature has 3 channels
conv1 = conv_block(input, 64, nums[0], 3)
conv2 = conv_block(conv1, 128, nums[1])
conv3 = conv_block(conv2, 256, nums[2])
conv4 = conv_block(conv3, 512, nums[3])
conv5 = conv_block(conv4, 512, nums[4])
fc_dim = 512
fc1 = paddle.layer.fc(input=conv5,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(input=fc1,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
out = paddle.layer.fc(input=fc2,
size=class_dim,
act=paddle.activation.Softmax())
return out
def vgg13(input, class_dim):
nums = [2, 2, 2, 2, 2]
return vgg(input, nums, class_dim)
def vgg16(input, class_dim):
nums = [2, 2, 3, 3, 3]
return vgg(input, nums, class_dim)
def vgg19(input, class_dim):
nums = [2, 2, 4, 4, 4]
return vgg(input, nums, class_dim)
def main():
global ts
paddle.init(use_gpu=False)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
# NOTE: v2 distributed training averages updates across nodes, so the learning rate is scaled down by the node count.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=learning_rate / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
train_reader = paddle.batch(
paddle.reader.shuffle(
cifar.train10(),
# To use other data, replace the above line with:
# reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
cifar.test10(),
# To use other data, replace the above line with:
# reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers,
is_local=False)
# End batch and end pass event handler
def event_handler(event):
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
if __name__ == '__main__':
main()
......@@ -24,22 +24,22 @@ Currently supported `--model` argument include:
* Run the following command to start a benchmark job locally:
```bash
python fluid_benchmark.py --model mnist --parallel 1 --device GPU --with_test
python fluid_benchmark.py --model mnist --device GPU
```
You can choose to use GPU/CPU training. With GPU training, you can specify
`--parallel 1` to run multi GPU training.
`--gpus <gpu_num>` to run multi GPU training.
* Run distributed training with parameter servers:
* start parameter servers:
```bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* start trainers:
```bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* Run distributed training using NCCL2
```bash
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method nccl2
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```
## Run Distributed Benchmark on Kubernetes Cluster
......@@ -48,7 +48,7 @@ We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submi
distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver --with_test" --disttype pserver
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
```
Then the yaml files are generated under directory `myjob`, you can run:
......
......@@ -94,6 +94,10 @@ def parse_args():
'--memory_optimize',
action='store_true',
help='If set, optimize runtime memory before start.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='If set, omit the actual data-reading operators.')
parser.add_argument(
'--update_method',
type=str,
......@@ -198,6 +202,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe.run(train_prog)
return
if args.use_fake_data:
raise Exception(
"fake data is not supported in single GPU test for now.")
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_prog)
......@@ -244,7 +252,31 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id):
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
# Generate fake input data:
if args.use_fake_data:
for var in feed_var_list:
v = startup_prog.global_block().clone_variable(var)
var.persistable = True
v.persistable = True
real_shape = list(var.shape)
real_shape[0] = args.batch_size / args.gpus
startup_prog.global_block().append_op(
outputs={"Out": v},
type="fill_constant",
attrs={"shape": real_shape,
"value": 1.0,
"dtype": var.dtype})
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
if nccl_id_var and trainer_id == 0:
#FIXME(wuyi): wait for the other trainers to start listening
time.sleep(30)
startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
......@@ -256,10 +288,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
exec_strategy=strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in range(args.pass_num):
num_samples = 0
......@@ -271,7 +300,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
num_samples = 0
if iters == args.iterations:
break
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.use_fake_data:
loss, = exe.run([avg_loss.name])
else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
num_samples += len(data)
......
......@@ -112,6 +112,7 @@ def gen_job():
envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: the directories below are cluster specific; please modify
# these settings before you run on your own cluster.
envs.append({
......
......@@ -54,5 +54,13 @@ envs = [
"fieldPath": "status.podIP"
}
}
},
{
"name": "PADDLE_CURRENT_IP",
"valueFrom": {
"fieldRef": {
"fieldPath": "status.podIP"
}
}
}
]
......@@ -41,6 +41,10 @@ if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
if(EIGEN_USE_THREADS)
add_definitions(-DEIGEN_USE_THREADS)
endif(EIGEN_USE_THREADS)
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
......
......@@ -212,6 +212,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS}
-Dprotobuf_BUILD_TESTS=OFF
-DCMAKE_SKIP_RPATH=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
......
......@@ -1003,9 +1003,9 @@ dice_loss
.. autofunction:: paddle.fluid.layers.dice_loss
:noindex:
bilinear_interp
upsampling_bilinear2d
____
.. autofunction:: paddle.fluid.layers.bilinear_interp
.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
:noindex:
# Paddle Fluid Developer Guide
---
### ==1==. Why do we need PaddlePaddle Fluid?
---
### Two fundamental questions
<font size=6>
1. How do we describe machine learning models and optimization processes?
  - Completely and self-consistently, with enough expressive power to support whatever computations may arise
1. How do we make full use of resources to compute efficiently?
  - Support asynchronous devices, multiple GPUs, and distributed computation
  - Lower the development cost of computation and of computational optimization
  - ...
</font>
---
### How to describe the model and the optimization process?
<font size=6>
<table>
<thead>
<tr>
<th> </th>
<th>A sequence of layers executed one after another</th>
<th>A computation graph built from variables and operators </th>
<th>No explicit notion of a model </th>
</tr>
</thead>
<tbody>
<tr>
<td> 2013</td>
<td> Caffe, Theano, Torch, PaddlePaddle </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> 2015 </td>
<td> </td>
<td> TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
<td> </td>
</tr>
<tr>
<td>2016 </td>
<td> </td>
<td> </td>
<td> PyTorch, TensorFlow Eager Execution, <font color=#483D8B>**==PaddlePaddle Fluid==**</font> </td>
</tr>
</tbody>
</table>
---
### <p align="center">目标 </p>
<font size=6>
- Improve the ability to describe all kinds of machine learning tasks: be able to describe any machine learning model that may arise.
- Keep the code structure clear and logical, with modules fully decoupled: internal and external contributors can focus on the functional modules they need and build further development on top of the framework.
- By design, leave room and potential for technical optimization.
- Decoupled code lowers the development cost of multi-device support, computational optimization, and the like.
- Under a unified design philosophy, achieve automatically scalable, automatically fault-tolerant distributed computation.
</font>
---
## ==2.== Design Overview
---
# Fluid: system architecture
- <span style="background-color:#ACD6FF;">[A compiler-like execution flow that separates compile time from runtime](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)</span>
<br>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_compiler.png" width=100%>
</p>
---
#### Let's use a concrete Fluid program to distinguish compile time from runtime
---
### Fluid at compile time
<font size=5>
- ==**Define the forward computation**==
```python
x = fluid.layers.data(name='x',shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)
```
- ==**Add backward, regularization, and optimization**==
```python
learning_rate = 0.01
sgd_optimizer = fluid.optimizer.SGD(learning_rate)
sgd_optimizer.minimize(avg_cost)
```
</font>
---
### `Program` vs. computation graph
<font size=5>
- In scientific computing, the computation graph is a classic way to describe computation. The figure below shows how a complete computation graph is built by starting from the forward graph (blue) and adding backward (red) and optimizer-related (green) operations:
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/graph_construction_example_all.png" width=60%>
</p>
- Fluid ==uses a `Program`, rather than a computation graph,== to describe the model and the optimization process. A `Program` is composed of `Block`s, `Operator`s, and `Variable`s; these concepts are expanded on later in this deck.
- At compile time, Fluid takes the forward computation (for now, think of it simply as an ordered flow of computation) as a `Program`, and completes it into the full computation by adding the relevant `Operator`s and `Variable`s in the order: forward -> backward -> gradient clipping -> regularization -> optimization.
</font>
---
### Fluid at runtime
<font size=5>
- ==**Read in the data**==
```python
train_reader = paddle.batch(
paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=20)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
```
- ==**Define the device that executes the program**==
```python
place = fluid.CPUPlace()
feeder = fluid.DataFeeder(place=place,feed_list=[x, y])
```
- ==Create an executor (`Executor`) and run the startup `Program` and the training `Program`==
```python
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for data in train_reader():
avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost])
print(avg_loss_value)
```
</font>
---
### Summary: what does the framework do, and what does the user do?
<br>
<font size=5>
<table>
<thead>
<tr>
<th>Building the training program</th>
<th>Running the training</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<span style="background-color:#B3D9D9">用户</span>:描述前向运算<br><span style="background-color:#DAB1D5;">框架</span>:添加反向运算<br><span style="background-color:#DAB1D5;">框架</span>:添加优化运算<br><span style="background-color:#DAB1D5;">框架</span>:添加内存优化<br><span style="background-color:#DAB1D5;">框架</span>:添加并行/多设备/分布式相关的计算单元
</td>
<td>
<span style="background-color:#DAB1D5;">框架</span>:创建Operator(计算)+ Variable(数据)<br><span style="background-color:#DAB1D5;">框架</span>:创建`Block`<br><span style="background-color:#DAB1D5;">框架</span>:内存管理/设备管理<br><span style="background-color:#DAB1D5;">框架</span>:执行计算
</td>
</tr>
</tbody>
</table>
</font>
---
### <p align="center">总结:编译时</p>
<font size=5>
<span style="background-color:#A3D1D1;">**用户编写一段Python程序,描述模型的前向计算**</span>
1. 创建变量描述 `VarDesc`
1. 创建operators的描述 `OpDesc`
1. 创建operators的属性
1. 推断变量的类型和形状,进行静态检查:`inferShape`
1. 规划变量的内存复用
1. 创建反向计算
1. 添加优化相关的Operators
1. (可选)添加多卡/多机相关的Operator,生成在多卡/多机上运行的程序
</font>
---
### <p align="center">总结:运行时</p>
<font size=5>
<span style="background-color:#C7C7E2;">**执行规划好的计算**</span>
1. 创建`Executor`
1. 为将要执行的一段计算,在层级式的`Scope`空间中创建`Scope`
1. 创建`Block`,依次执行`Block`
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compile_run_time.png" width=50%><br>
<font size=3> Figure. Overview of compile time and runtime</font>
</p>
</font>
---
<!-- *template: invert -->
## ==3==. How does the user describe computation?
---
### Fluid: define computation ==as if writing a program==
<font size=5>
- Sequential execution
```python
x = fluid.layers.data(name='x',shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
```
- Conditional branches: [switch](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), [ifelse](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md)
```python
a = fluid.Var(10)
b = fluid.Var(0)
switch = fluid.switch()
with switch.block():
with switch.case(fluid.less_equal(a, 10)):
fluid.print("Case 1")
with switch.case(fluid.larger(a, 0)):
fluid.print("Case 2")
with switch.default():
fluid.print("Case 3")
```
>[A Lisp cond form may be compared to a continued if-then-else as found in many algebraic programming languages](https://www.cs.cmu.edu/Groups/AI/html/cltl/clm/node84.html).
</font>
---
### Fluid: define computation ==as if writing a program==
<font size=5>
- Loops: [while](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
```python
d0 = layers.data("d0", shape=[10], dtype='float32')
data_array = layers.array_write(x=d0, i=i)
array_len = layers.fill_constant(shape=[1],dtype='int64', value=3)
cond = layers.less_than(x=i, y=array_len)
while_op = layers.While(cond=cond)
with while_op.block():
d = layers.array_read(array=data_array, i=i)
i = layers.increment(x=i, in_place=True)
layers.array_write(result, i=i, array=d)
layers.less_than(x=i, y=array_len, cond=cond)
```
- For a complete example, see [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_while_op.py#L36-L44)
- For beam search, see [->]( https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
</font>
---
#### <p align="center">总结</p>
<font size=5>
1. The description syntax exposed to users is complete and self-consistent, and capable of describing complex computation processes
1. Its usage and core concepts are analogous to a programming language, so existing intuition transfers directly
1. It supports defining a problem and solving it step by step
</font>
---
## ==4.== Core concepts
---
### Compile-time concepts: ==describing variables and computation==
<font size=5>
- `VarDesc` + `TensorDesc` + `OpDesc` -> `BlockDesc` -> `ProgramDesc`
- https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto
- <span style="background-color:#DAB1D5;">什么是 Fluid Program</span>
- 在Fluid中,一个神经网络任务(训练/预测)被描述为一段`Program`
- `Program`包含对`Variable`(数据)和 `Operator`(对数据的操作)的描述
- `Variable``Operator` 被组织为多个可以嵌套的`Block`,构成一段完整的`Fluid Program`
>编译阶段最终,经过 Transpiler 的执行规划,变换处理,生成使用`protobuf`序列化后的`ProgramDesc`。可以发送给多卡或者网络中的其它计算节点执行
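A minimal introspection sketch from Python (the op names printed depend on the layers actually used):
```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.fc(input=x, size=1)

prog = fluid.default_main_program()
print(len(prog.blocks))                             # number of (possibly nested) Blocks
print([op.type for op in prog.global_block().ops])  # e.g. ['mul', 'elementwise_add']
```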
</font>
---
### Compile-time concepts: ==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**==
<font size=5>
1. Takes a `ProgramDesc` as input and produces a new `ProgramDesc`
   - *Memory optimization transpiler*: inserts `FreeMemoryOps` into the original `ProgramDesc` so memory is released before an iteration finishes, keeping the memory footprint small
   - *Distributed training transpiler*: rewrites the original `ProgramDesc` into its distributed counterpart, producing two new `ProgramDesc`s (driven from Python as sketched below):
     1. the `ProgramDesc` executed by the trainer processes
     1. the `ProgramDesc` executed by the parameter server
1. ==**WIP**==: takes a `ProgramDesc` and generates code that `gcc`, `nvcc`, `icc`, etc. can compile directly, yielding an executable
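A minimal sketch of driving the distributed transpiler, assuming the 2018-era `fluid.DistributeTranspiler` interface; the endpoint and trainer count are made up:
```python
import paddle.fluid as fluid

# ... build the forward/backward/optimizer program first ...
t = fluid.DistributeTranspiler()
t.transpile(trainer_id=0, pservers="127.0.0.1:6174", trainers=2)

pserver_prog = t.get_pserver_program("127.0.0.1:6174")  # run by the parameter server
trainer_prog = t.get_trainer_program()                   # run by each trainer
```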
</font>
---
### Transpiler
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/transpiler.png" width=70%>
</p>
---
### Printing the `ProgramDesc`
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/print_fluid_program.png" width=70%>
</p>
<font size=5>
- `default_startup_program`: creates the learnable parameters and initializes them
- `default_main_program`: the user-defined model, including the forward, backward, optimization, and all other required computation
- Print a human-readable `Program`:
```python
from paddle.v2.fluid import debuger, framework  # the module really is spelled `debuger`
print debuger.pprint_program_codes(framework.default_main_program().desc)
```
</font>
---
### Sample output
<font size=5>
<table>
<thead>
<th>variable in block 0</th>
<th>variable in block 0</th>
</thead>
<tbody>
<tr>
<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc1.png" width=70%></td>
<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc2.png" width=70%></td>
</tr>
</tbody>
</table>
</font>
---
### 运行时概念
<font size=5>
- Data-related
  - `Tensor` / `LoDTensor` / `Variable`
  - `Scope`
- Computation-related
  - `Block`
  - `Kernel`, `OpWithKernel`, `OpWithoutKernel`
<table>
<thead>
<th></th>
<th>protobuf messages</th>
<th>C++ class objects</th>
</thead>
<tbody>
<tr>
<td>Data</td>
<td>[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107)
</td>
<td>[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24)
</td>
</tr>
<tr>
<td>Operation</td>
<td>[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35)
</td>
<td>[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64)
</td>
</tr>
<tr>
<td>Block</td>
<td>BlockDesc
</td>
<td>Block
</td>
</tr>
</tbody>
</table>
- Execution-related: `Executor`
</font>
---
#### Tensor and LoD (Level-of-Detail) Tensor
<font size=5>
- A Tensor is the generalization of an $n$-dimensional array; a LoDTensor attaches sequence information on top of a Tensor
- In Fluid, inputs, outputs, and the learnable parameters of a network are all represented uniformly as LoDTensors (n-dimensional arrays)
- One mini-batch of input data is one LoDTensor
- Thanks to the `LoDTensor` representation, RNNs in Fluid process variable-length sequences without padding
- LoD can be understood simply as `std::vector<std::vector<int>>`
- For non-sequence data, the LoD information is empty
<table>
<thead>
<th></th>
<th>TensorFlow</th>
<th>PaddlePaddle</th>
</thead>
<tbody>
<tr>
<td>RNN</td>
<td>Support
</td>
<td>Support
</td>
</tr>
<tr>
<td>recursive RNN</td>
<td>Support
</td>
<td>Support
</td>
</tr>
<tr>
<td>padding zeros</td>
<td>Must
</td>
<td>No need
</td>
<tr>
<td>blob data type</td>
<td>Tensor
</td>
<td>LODTensor
</td>
</tr>
</tbody>
</table>
</font>
---
#### A LoD example
<font size=4>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LoDTensor.png" width=43%>
</p>
- LoD of figure (a)
```cpp
[0, 5, 8, 10, 14]
```
- LoD of figure (b)
```cpp
[[0, 5, 8, 10, 14] /*level=1*/, [0, 2, 3, 5, 7, 8, 10, 13, 14] /*level=2*/]
```
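- A minimal sketch of building the LoDTensor of figure (a) from Python, assuming the offset-based `set_lod` interface of this era:
```python
import numpy as np
import paddle.fluid.core as core

t = core.LoDTensor()
t.set(np.random.rand(14, 8).astype('float32'), core.CPUPlace())  # 14 time steps in total
t.set_lod([[0, 5, 8, 10, 14]])  # 4 sequences with lengths 5, 3, 2, 4
```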
</font>
---
#### How Tensor, Variable, and Scope relate
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/scope_variable_tensor.png" width=40%>
</p>
<font size=5>
1. `Block` is an implementation-level concept that is not exposed at the application layer. Users currently cannot create or use `Block`s directly; the only concept they perceive is the `Program`.
1. Logically, a `Block` is analogous to a pair of braces in a programming language: it defines a scope in which a piece of code runs
1. The `Executor` creates a `Scope` for every `Block`; `Block`s can nest, so `Scope`s nest as well
</font>
---
### Executor
<font size=5>
<table>
<thead>
<th>Interface</th>
<th>Description</th>
</thead>
<tbody>
<tr>
<td><p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/executor.png" width=60%>
</p></td>
<td><span style="background-color:#B3D9D9;">输入</span><br>1. `ProgramDesc`<br>2. `Scope`<br> 3.`block_id`<br><br><span style="background-color:#B3D9D9;">解释执行步骤</span><br>1. 创建所有 Variables<br> 2. 逐一创建 Operator 并运行
</td>
</tr>
</tbody>
</table>
</font>
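From Python, the `Executor` is driven roughly as follows (a minimal sketch; the toy network is made up):
```python
import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())  # create and initialize parameters
loss_v, = exe.run(fluid.default_main_program(),
                  feed={'x': np.random.rand(8, 13).astype('float32')},
                  fetch_list=[loss])
```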
---
### Operator/OpWithKernel/Kernel
<font size=5>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator1.png" width=50%>
</p>
- Operators are stateless; the core of an Operator is its ==Run== method
- One operator may register multiple kernels
- An operator may also have no kernel at all: while_op, ifelse_op
</font>
---
#### Fluid Operator vs. PaddlePaddle layers
<font size=5>
<table>
<thead>
<th>Layer</th>
<th>Operator</th>
</thead>
<tbody>
<tr>
<td><p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/layer.png" width=70%>
</p></td>
<td><p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator2.png" width=73%>
</p></td>
</tr>
<tr>
<td>1. Maintains internal state<br>2. Has forward and backward methods</td>
<td>1. Stateless<br>2. Has only a Run method</td>
</tr>
</tbody>
</table>
</font>
---
### ==5.== Memory management
---
### Goals
- Provide a unified allocation/deallocation interface for heterogeneous devices
- Minimize both the time spent managing memory and the management overhead
- Reduce memory fragmentation
- Decouple memory management completely from computation (Operators/Kernels)
- Unified memory management is the foundation of memory optimization
---
<font size=5>
### The Memory interface
- The memory-management module exposes three basic interfaces to the application logic above it:
```cpp
template <typename Place>
void* Alloc(Place place, size_t size);
template <typename Place>
void Free(Place place, void* ptr);
template <typename Place>
size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
};
```
- The template parameter `Place` indicates the device on which the allocation happens
- An implementation specializes the supported `Place`s and provides these three interfaces for each
</font>
---
### Code structure
<font size=5>
The memory-management module consists of two parts:
1. SystemAllocator: the interface that actually allocates and frees memory on the physical device
1. BuddyAllocator: the memory-management algorithm
</font>
---
### System Allocator
<font size=5>
- SystemAllocator is the base class for physical memory allocation and deallocation
- Allocation and deallocation on any device ultimately reduce to calls on this standard interface
- Each device gets a MemoryAllocator derived from SystemAllocator
```cpp
class SystemAllocator {
public:
virtual ~SystemAllocator() {}
virtual void* Alloc(size_t& index, size_t size) = 0;
virtual void Free(void* p, size_t size, size_t index) = 0;
virtual bool UseGpu() const = 0;
};
```
</font>
---
### CPU/GPU Allocator
<font size=5>
```cpp
class CPUAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t& index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
};
#ifdef PADDLE_WITH_CUDA
class GPUAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t& index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t gpu_alloc_size_ = 0;
size_t fallback_alloc_size_ = 0;
};
#endif
```
- CPUAllocator and GPUAllocator both derive from SystemAllocator and call the corresponding standard library functions to allocate and free physical memory.
- Once a large, contiguous chunk of physical memory has been allocated, the memory-management algorithm takes over: it hands out, reclaims, and reuses blocks within that chunk.
</font>
---
### CPU Allocator
<font size=5>
- CPU memory can be allocated in two flavors:
  1. non-pinned memory: pageable memory
  2. pinned memory: page-locked memory
- Allocating too much page-locked memory reduces the pageable memory available to the system and can hurt overall performance, so the CPU default is pageable memory
- gflags control the size of the one-shot allocation and whether page-locked memory is used:
```cpp
DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
DEFINE_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
```
</font>
---
### GPU Allocator
<font size=5>
- GPU memory is allocated with cudaMalloc
- GPUAllocator::Alloc first computes the available memory on the specified GPU device
  - If the available memory covers the requested size, cudaMalloc is called
  - If it does not, the allocator currently reports an error and exits.
- A gflag controls how much GPU memory the one-shot allocation grabs:
```cpp
DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
"Default use 92% of GPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
```
</font>
---
#### The memory-management algorithm: [Buddy Memory Allocation](https://en.wikipedia.org/wiki/Buddy_memory_allocation)
<font size=5>
- Memory Arena: a large contiguous chunk is allocated once, and all further management happens inside it: blocks are dynamically allocated, freed, and reused.
- Buddy allocation:
  - Memory is split into power-of-two partitions, and requests are served with a best-fit strategy.
  - On free, the buddy block is checked: if the adjacent block is free as well, the two are merged, minimizing fragmentation.
  - Allocated blocks are aligned to natural physical-memory boundaries, improving access efficiency.
- The algorithm is time-efficient, but the best-fit rounding wastes some memory (see the sketch below)
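- A small illustrative sketch (not Paddle code) of the two core operations, block-size rounding and buddy lookup:
```python
MIN_BLOCK = 1 << 12  # assume a 4 KiB minimum block size

def round_up_pow2(size):
    """Power-of-two size class that actually serves a request."""
    n = MIN_BLOCK
    while n < size:
        n <<= 1
    return n

def buddy_offset(offset, block_size):
    """Offset of a block's buddy: flip the bit at log2(block_size)."""
    return offset ^ block_size

print(round_up_pow2(10000))    # 16384 -> 6384 bytes wasted by best-fit rounding
print(buddy_offset(0, 16384))  # 16384: if that block is also free, merge to 32768
```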
</font>
---
### Buddy Allocator
<font size=5>
- BuddyAllocator is a singleton: each device (e.g. CPU(0)/GPU(0)/GPU(1)) owns one BuddyAllocator
- A BuddyAllocator holds a SystemAllocator as a private member
- When a request exceeds the free memory a BuddyAllocator manages, the SystemAllocator is invoked to allocate physical memory on the target device
</font>
---
### Example: the CPU implementation of the memory interface
<font size=5>
- The layers above uniformly allocate, free, and query memory through the BuddyAllocator:
```cpp
template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size);
VLOG(10) << " pointer=" << p;
return p;
}
template <>
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
template <>
size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
return GetCPUBuddyAllocator()->Used();
}
```
</font>
---
### ==6.== Multi-device support
---
### Multi-device support (1)
<font size=5>
- step 1: add a Place type, <span style="background-color:#DAB1D5;">implemented by the contributor and added to the framework</span>
  - A Place can be thought of as an integer plus an enum: device id + device type
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/place.png" width=40%>
</p>
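- At the Python layer the same idea surfaces as `CPUPlace`/`CUDAPlace` (a minimal sketch):
```python
import paddle.fluid as fluid

cpu = fluid.CPUPlace()      # device type only
gpu0 = fluid.CUDAPlace(0)   # device type + device id
exe = fluid.Executor(gpu0)  # binds the computation to that place
```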
- DeviceContext
  - Each Place maps to a DeviceContext, which organizes and manages device-related state
  - For example, a GpuDeviceContext manages the CUDA stream
  - In the current implementation, some special libraries also get a DeviceContext of their own, for example:
```cpp
class MKLDNNDeviceContext : public CPUDeviceContext {……}
```
  - What each device's DeviceContext must manage differs; implement it according to the device's concrete needs
</font>
---
### Multi-device support (2)
<font size=5>
- step 2: add a KernelType and register Kernel objects for it, <span style="background-color:#DAB1D5;">implemented by the contributor and registered with the framework</span>. Kernels of the same operator can be distinguished by:
  1. Place: the execution device
  1. DataType: the element type, FP32/FP64/INT32/INT64
  1. Memory layout: how the run-time Tensor is laid out in memory, e.g. NCHW, NHWC
  1. the library used
  so one operator can register multiple Kernels.
```cpp
struct OpKernelType {
proto::DataType data_type_;
DataLayout data_layout_;
platform::Place place_;
LibraryType library_type_;
}
```
</font>
---
### Multi-device support (3)
<font size=5>
step 3: run-time KernelType inference and Kernel switching, <span style="background-color:#DAB1D5;">adjust the inference and switching rules as needed</span>
- Expected Kernel: the kernel we would like to call, decided by (1) the `Place` and compute precision, or (2) a library the user explicitly selects in the configuration, such as `cudnn` or `mkldnn`.
- Actual Kernel: the `KernelType` actually implied at run time by the `Operator`'s inputs (`Variable`s)
- When the Expected and Actual Kernels disagree, the framework inserts `data_transformer` or `data_layout_transform` steps so the Expected Kernel can run, covering:
  - CPUPlace -> GPUPlace: cross-device memory copy
  - NCHW -> nChw8c: layout transformation
  - FP32 -> FP16: precision conversion _**not yet supported**_
  - ……
- This logic lives in OperatorWithKernel's Run method [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.cc#L497)
</font>
---
## ==7.== while_op
---
### while_op
<font size=5>
- Repeatedly executes a `Program` until the condition operator decides the loop condition no longer holds
- What makes while_op special:
  1. while_op has no kernel
  1. while_op owns a `Block` of its own, forming nested `Block`s
  1. ==while_op creates an Executor internally to execute its `Block` repeatedly==
- while_op inputs/outputs: LoDTensorArray
```cpp
namespace paddle {
namespace framework {
using LoDTensorArray = std::vector<LoDTensor>;
}
}
```
- Each iteration "slices" one piece out of the original input
- LoDTensorArray is exposed to Python and is one of Fluid's basic data structures; users can create and use it directly (see the sketch below)
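- A minimal sketch of creating and using a LoDTensorArray from Python:
```python
import paddle.fluid as fluid

i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
x = fluid.layers.fill_constant(shape=[2, 3], dtype='float32', value=1.0)
arr = fluid.layers.array_write(x=x, i=i)          # creates the array, writes x at arr[0]
x_back = fluid.layers.array_read(array=arr, i=i)  # reads it back
```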
</font>
---
### Overview of while_op's [Run](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/while_op.cc#L42) method
<font size=5>
```cpp
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program();
auto step_scopes =
scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
while (cond.data<bool>()[0]) {
auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope);
executor.Run(*program, &current_scope, block->ID(),
false /*create_local_scope*/);
}
}
```
</font>
---
### An important application of while_op: Dynamic RNN
---
### What is a `dynamicRNN`?
<font size=5>
<br>
1. Users define the computation within one time step; the framework takes sequence input and repeatedly applies that user-defined step over it
1. Learnable parameters are shared across time steps
1. `dynamicRNN` is implemented with `while_op`
1. If a `memory` is defined inside a `dynamicRNN`, it forms a recurrent neural network; otherwise it merely loops the predefined single-step computation over the input sequence (see the sketch below)
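A minimal sketch of the user-facing API, assuming the 2018-era `fluid.layers.DynamicRNN`; sizes and names are made up:
```python
import paddle.fluid as fluid

sentence = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
emb = fluid.layers.embedding(input=sentence, size=[10000, 32])

drnn = fluid.layers.DynamicRNN()
with drnn.block():
    word = drnn.step_input(emb)                # one step of every still-alive sequence
    prev = drnn.memory(shape=[32], value=0.0)  # recurrent state, zero-initialized
    hidden = fluid.layers.fc(input=[word, prev], size=32, act='tanh')
    drnn.update_memory(prev, hidden)           # hidden becomes next step's prev
    drnn.output(hidden)

last = fluid.layers.sequence_last_step(drnn())  # drnn() returns the per-step outputs
```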
</font>
---
#### The `dynamicRNN` user interface
<font size=5>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/user_interface.png" width=75%>
</p>
- The key elements of a `dynamicRNN`
  1. **step input**: the input `dynamicRNN` consumes at each time step
  1. **step function**: the user-defined single-step computation
  1. **memory**: forms the recurrent connection
  1. **external/static memory**: external input fully readable at every step of the computation
</font>
---
#### Memory inside dynamicRNN
<font size=5>
A `memory` in `dynamicRNN` behaves much like a reference variable in C++
- A `memory` "points to" the output variable of some operator, call it A
- A `memory` can be initialized from a LoDTensor (non-sequence when the LoD is empty, sequence otherwise); by default a `memory` is zero-initialized
- The forward computation of the `memory` runs after operator A's forward computation
- That forward computation makes the `memory` "point to" A's output LoDTensor
- The `memory`'s output can become another operator's input, which closes the "recurrent" connection
</font>
---
### DynamicRNN implementation details
<font size=5>
- `while_op` <span style="background-color:#DAB1D5;">cannot form a dynamicRNN on its own</span>; it must cooperate with a set of related operators and data structures
- Operators it depends on (only the most important are listed here, not all):
- `lod_rank_table` operator
- `lod_tensor_to_array` operator
- `array_to_lod_tensor` operator
- `shrink_memory` operator
- Data structures it depends on
- `TensorArray`
- `LoDRankTable`
- In Fluid, RNNs take variable-length sequence input without padding; the data structures above and the related operators work together to batch-process variable-length input
</font>
---
### How does `dynamicRNN` compute in batches?
<font size=5>
- The problem:
  - An RNN can be viewed as an unrolled feed-forward network whose depth equals the length of the longest sequence
  - Without padding the variable-length sequences to a common length, the samples in a mini-batch unroll to different depths, which makes the forward and backward computation hard to implement
</font>
----
##### Example: RNN encoder-decoder with attention
<font size=5>
- Take the RNN encoder-decoder model for machine translation (it exercises every design element of `dynamicRNN`); the figure below shows the raw input of the RNN encoder-decoder:
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/raw_input.png" width=100%><br><font size=3> Figure. RNN encoder-decoder 原始batch 输入数据</font>
</p>
- the source word sequences are the input to the encoder RNN, a LoDTensor
- the target word sequences are the input to the lookup_table, a LoDTensor
- each rectangle above is one contiguous region of CPU/GPU memory, holding one dense vector
</font>
---
### How does `dynamicRNN` compute in batches?
<font size=5>
1. Sort the unequal-length samples within a mini batch: the longest sample becomes the first in the batch, the shortest the last
   - `LoDTensor` -> `LoDRankTable` :heavy_plus_sign: `lod_rank_table` operator
     - Think of a `LoDRankTable` as the sequences of a LoDTensor sorted by length; it stores the post-sort indices
2. Build the batch input for each time step; as the time step grows, the per-step batch may shrink
   - `TensorArray` :heavy_plus_sign: `lod_tensor_to_array` -> `LoDTensor` (without LoD)
3. Write each time step's output into an output `LoDTensorArray`
4. When the `dynamicRNN` loop finishes, re-sort the output `LoDTensorArray` according to the `LoDRankTable`, restoring the original input order
   - `TensorArray` :heavy_plus_sign: `array_to_lod_tensor` -> `LoDTensor`
</font>
---
### Running example
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sorted_input.png" width=100%>
</p>
---
### Running example
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=100%>
</p>
<font size=5>
- When execution reaches time steps 5 through 7, the batch size shrinks
</font>
---
### Running example
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=80%>
</p>
<font size=5>
- What happens to the RNN's `memory` at steps 5 through 7?
  - A `memory` points to some operator's output Tensor and "fetches back" that result after the operator's forward computation
  - At steps 5 through 7, some sequences have ended; ==the next time step must no longer unroll over the finished sequences==
  - In `dynamicRNN`, the `shrink_memory` operator shrinks the `memory`'s batch input
</font>
---
### Running example: steps 1 ~ 2
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/2.png" width=70%><br><font size=4>Figure. 第1、2个batch输入dynamicRNN的batch输入</font>
</p>
---
### Running example: steps 3 ~ 4
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/3.png" width=70%><br><font size=4>Figure. 第3、4个batch输入dynamicRNN的batch输入</font>
</p>
---
### Running example: steps 5 ~ 7
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/4.png" width=70%><br><font size=4>Figure. 第5、6、7个batch输入dynamicRNN的batch输入</font>
</p>
---
### ==8.== Fluid code layout
---
### Fluid code layout
<table>
<thead>
<tr>
<th>Code layout</th>
<th>Module structure</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_1.png" width=60%>
</p>
</td>
<td>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_2.png" width=60%>
</p>
</td>
</tr>
</tbody>
</table>
---
### ==9.== Documentation summary
---
<font size=5>
- Design overviews
  - refactoring overview [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
  - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
  - fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
- Core concepts
  - variable description [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md)
  - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md)
  - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
  - TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md)
  - Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md)
  - Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md)
  - Scope [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)
---
- Key functional modules
  - backward [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/backward.md)
  - memory optimization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/memory_optimization.md)
  - evaluator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/executor.md)
  - python API [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md)
  - regularization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/regularization.md)
- Development guides
  - supporting new hardware/device libraries [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md)
  - adding a new Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md)
  - adding a new Kernel [->](
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
</font>
---
### ==10.== Development guide
---
#### Recommended dev environment: build and test with Docker
<font size=5>
Building the PaddlePaddle source with Docker: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
PaddlePaddle on Docker Hub: [->](
https://hub.docker.com/r/paddlepaddle/paddle/tags/)
1. Pull the PaddlePaddle Docker image
```bash
docker pull paddlepaddle/paddle:latest-dev
```
1. Start a docker container
```bash
docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
```
1. Inside the container, build from source; see the documentation [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html)
</font>
---
### Notes
<font size=5>
1. To keep it small, the PaddlePaddle Docker image ships without vim; run `apt-get install -y vim` inside the container if you need it.
1. For development, use the image tagged `latest-dev`, which bundles all build dependencies. `latest` and `latest-gpu` are production images, intended for running PaddlePaddle programs.
1. To run GPU programs in Docker, nvidia-docker is recommended; [otherwise you must mount the CUDA libraries and devices into the container](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
<font size=4>
```bash
nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
```
</font>
</font>
---
### [How to contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
<font size=5>
- ==Before opening a Pull Request, be sure to read==: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
- Code requirements
  1. Code comments must follow the Doxygen style
  1. Make sure the compiler option WITH_STYLE_CHECK is on and the build passes the style checks
  1. All code must come with unit tests, and all unit tests must pass
- Submit Pull Requests with the `pre-commit` hook
  1. It helps format the source code (C++, Python)
  1. It automatically checks basics before each commit: exactly one EOL per file, no large files added to Git, and so on
  1. Install pre-commit and run it once in the PaddlePaddle root directory:
```bash
➜ pip install pre-commit
➜ pre-commit install
```
</font>
---
### How to contribute
<font size=5>
1. Open an issue before you start developing.
   - It lets others know the work is underway and avoids two people building the same feature.
1. A PR must be linked to its issue; see [->](https://help.github.com/articles/closing-issues-using-keywords/)
   - Purpose: the commit history then records which feature the PR implements or which problem it solves.
   - When the PR is merged, the linked issue closes automatically.
1. During PR review, every reviewer comment must receive a reply.
   - If the fix is done, a simple "Done" suffices.
   - Purpose: review comments may be (1) questions, (2) items that can be fixed in the next PR, or (3) suggestions the author considers unreasonable. Explicit replies leave a traceable record for the reviewer and everyone else of whether a change was made, deferred to a follow-up PR, or declined.
</font>
---
### ==11.== Adding a new Operator
---
### Concepts
<font size=5>
Adding a new operator involves deriving from the following C++ classes:
1. `framework::OperatorBase`: the Operator (Op for short) base class.
1. `framework::OpKernel`: the base class of an Op's compute functions, called Kernels.
1. `framework::OperatorWithKernel`: derives from OperatorBase; an Op that has compute functions, i.e. has Kernels.
1. `class OpProtoAndCheckerMaker`: describes the Op's inputs, outputs, attributes, and documentation, mainly used to generate the Python API.
Depending on whether they have a kernel, Ops come in two kinds:
1. Ops with Kernels derive from OperatorWithKernel; ==the vast majority of operators are of this kind==
1. Ops without a kernel derive from OperatorBase; only a few Ops belong here, e.g. while_op, ifelse_op
<span style="background-color:#DAB1D5;">This section focuses on how to write an Op with a Kernel.</span>
</font>
---
#### Which files does a new Operator add or modify?
<font size=5>
<table>
<thead>
<tr>
<th>What</th>
<th>Where it is defined</th>
</tr>
</thead>
<tbody>
<tr>
<td>
OpProtoMaker definition
</td>
<td>
`.cc` file; <span style="background-color:#DAB1D5;">backward Ops need no OpProtoMaker</span>
</td>
</tr>
<tr>
<td>
Op definition
</td>
<td>
`.cc` file
</td>
</tr>
<tr>
<td>
Kernel implementation
</td>
<td>
<span style="background-color:#DAB1D5;">CPU、CUDA共享Kernel实现在`.h`文件中</span>,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。
</td>
</tr>
<tr>
<td>
Op registration
</td>
<td>
The Op is registered in the `.cc` file; the CPU Kernel registration lives in the `.cc` file and the CUDA Kernel registration in the `.cu` file
</td>
</tr>
</tbody>
</table>
- Before adding an Operator, read the [Operator naming convention](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/name_convention.md) and the [Operator Markdown comment convention](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/op_markdown_format.md)
- New ops go under [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators), in files named `*_op.h` (if any), `*_op.cc`, and `*_op.cu` (if any).
- Ops and their Python bindings are wired up automatically from the file names; <span style="background-color:#DAB1D5;">stick to this naming, or you will additionally have to modify the PyBind files and CMakeLists.txt</span>
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 1</span>: define the ProtoMaker class
<font size=5>
The walkthrough below uses [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h) as the running example
- clip_op computes: $Out = \min(\max(X, min), max)$
- First define a `ProtoMaker` that describes the Op's inputs and outputs and adds documentation (<font size=4>*the comments in the snippet below are simplified; real code must follow the comment convention*</font>):
```cpp
template <typename AttrType>
class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X","(Tensor)The input of clip op.");
AddOutput("Out", "(Tensor),The output of clip op.");
AddAttr<AttrType>(
"min", "(float),Minimum value.");
AddAttr<AttrType>(
"max", "(float),Maximum value.");
AddComment(R"DOC(
……
)DOC");
}
};
```
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 2</span>: define the Operator class
<font size=5>
The snippet below defines `clip_op`:
```cpp
class ClipOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ClipOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ClipOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto max = ctx->Attrs().Get<float>("max");
auto min = ctx->Attrs().Get<float>("min");
PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
```
</font>
---
### What the Operator class must do
<font size=5>
1. clip_op derives from `OperatorWithKernel`; the line
```cpp
using framework::OperatorWithKernel::OperatorWithKernel;
```
   means it uses the constructor of the base class `OperatorWithKernel`.
1. It overrides the `InferShape` interface.
   - `InferShape` is a const function and must not modify the Op's member variables
   - Its parameter, `const framework::InferShapeContext &ctx`, gives access to the inputs, outputs, and attributes
   - `InferShape` is called twice: once at compile time (when the op is created) and once at run time (when the op's `Run` method is invoked). It must:
     1. check early and fail early: validate that input dimensions, types, etc. are legal
     2. set the shape of the output Tensor
<span style="background-color:#DAB1D5;">By convention, the `OpProtoMaker` and the `Op` class definition are written in the `.cc` file.</span>
</font>
---
### Additional notes
<font size=5>
1. `InferShape` currently supports two implementation styles; <span style="background-color:#DAB1D5;">both end up registering a functor into the OpInfo struct.</span>
   1. derive from framework::InferShapeBase and implement it as a functor (see [mul_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22))
   2. override the InferShape function (see [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24))
1. What is a `functor`?
   - A class or struct that only overloads `()`; usually a compute function that several kernels can reuse.
<font size=4>
```cpp
template <typename T>
class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& ctx,
framework::Tensor* out,
const framework::Tensor* prob,
const framework::Tensor* labels, const bool softLabel) {
……
}
};
```
</font>
- clip_op also shows how a piece of computation is abstracted into a functor: [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27).
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 3</span>: define the OpKernel class
<font size=5>
- `ClipKernel` derives from `framework::OpKernel` and carries two template parameters:
  1. `typename DeviceContext`: the device type. Add this parameter when several devices share one Kernel; otherwise provide a separate specialization per device.
  1. `typename T`: the supported data types, e.g. `float`, `double`.
- Override the `Compute` method in the `ClipKernel` class
  1. `Compute` takes one argument: `const framework::ExecutionContext& context`
     - `ExecutionContext` gathers the op's run-time input and output `Variable`s from the `Scope`, so `Compute` can fetch whatever it needs simply by name
     - Compared with `InferShapeContext`, `ExecutionContext` additionally carries the device type
  1. The `OpKernel`'s concrete computation logic is implemented inside `Compute`
</font>
---
#### ClipKernel at a glance
<font size=5>
```cpp
template <typename DeviceContext, typename T>
class ClipKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto max = context.Attr<T>("max");
auto min = context.Attr<T>("min");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
T* out_data = out->mutable_data<T>(context.GetPlace());
const T* x_data = x->data<T>();
int64_t numel = x->numel();
Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), x_data,
x_data + numel, out_data, ClipFunctor<T>(min, max));
}
};
```
- To keep `OpKernel` implementations simple and let CPU and CUDA share code, Fluid uses Eigen as its basic matrix library
- Fluid provides thin wrappers around Eigen's unsupported Tensor module that can be called directly inside `Compute`
- For how to use Eigen inside PaddlePaddle, see the [usage doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 4</span>: implement the backward Op
<font size=5>
- ==**A backward Op has no `ProtoMaker`**==; apart from that, it is defined and implemented exactly like a forward Op, so the details are not repeated here
- Only the backward Op's inputs and outputs deserve a note:
  1. Backward Op inputs
     - the forward Op's outputs
     - the gradients handed to this Op during back-propagation
     - Note that <span style="background-color:#e1c4c4;">Fluid does not distinguish cost Ops from intermediate-layer Ops; every Op must handle the gradients it receives correctly</span>
  2. Backward Op outputs
     - the gradients with respect to the learnable parameters
     - the gradients with respect to all inputs
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 5</span>: register the Op and its Kernels
<font size=5>
With the Op and its kernels implemented, register them in the `.cc` and `.cu` files
1. In the `.cc` file, register the forward and backward Op classes and the CPU Kernels.
<font size=4>
```cpp
namespace ops = paddle::operators;
REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
ops::ClipOpGrad);
REGISTER_OP_CPU_KERNEL(
clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
```
- In the snippet above:
  1. `REGISTER_OP`: registers `ops::ClipOp` under the type name `clip` with `ops::ClipOpMaker` as its `ProtoMaker`, and registers `ops::ClipOpGrad` under `clip_grad`
  1. `REGISTER_OP_WITHOUT_GRADIENT`: registers an Op without a backward, e.g. optimizer-related Ops
  1. `REGISTER_OP_CPU_KERNEL`: registers `ops::ClipKernel` specialized for `paddle::platform::CPUDeviceContext` and `float`, and likewise `ops::ClipGradKernel`
</font>
1. Register the GPU Kernels the same way in the `.cu` file
   - <span style="background-color:#e1c4c4;">If the CUDA Kernel is implemented with Eigen, add the macro `#define EIGEN_USE_GPU` at the top of the `.cu` file</span>
</font>
---
##### Building and the Python binding
<font size=5>
- The command below builds only the newly added Op:
```
make mul_op
```
- <span style="background-color:#e1c4c4;">Note that running the unit tests requires building the whole project</span>
- If the file-naming rules above are followed, the build automatically generates the Python binding for the new op and links it into the output library
</font>
---
###### Implementing an Operator with a Kernel, <span style="background-color:#c4e1e1;">step 6</span>: add forward unit tests and a gradient check
<font size=5>
- Unit tests for new Ops all go under [python/paddle/v2/fluid/tests/unittests](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/unittests)
- Forward Operator tests
  1. Op unit tests derive from `OpTest`; the concrete cases live in `TestClipOp`, and every test case is named `TestXX`
  1. Testing an Operator requires:
     1. defining the inputs, outputs, and relevant attributes in the `setUp` function
     1. generating random input data
     1. re-implementing the forward operator's computation in the Python script and comparing its output with the operator's forward output
  1. The gradient-check machinery is already in place; simply call `check_grad`
- See the `clip_op` unit test [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_clip_op.py); a minimal sketch follows
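- A minimal sketch of such a test (the real `test_clip_op.py` additionally nudges inputs away from the clip boundaries so the gradient check stays stable):
```python
import numpy as np
from op_test import OpTest  # test harness under python/paddle/fluid/tests/unittests


class TestClipOp(OpTest):
    def setUp(self):
        self.op_type = "clip"
        x = np.random.random((4, 5)).astype("float32")
        self.inputs = {'X': x}
        self.attrs = {'min': 0.3, 'max': 0.7}
        self.outputs = {'Out': np.clip(x, 0.3, 0.7)}  # NumPy reference forward

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['X'], 'Out')
```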
</font>
---
#### Building and running the unit tests
<font size=5>
- New `test_*.py` unit tests under `python/paddle/v2/framework/tests` are picked up and built automatically
- <span style="background-color:#e1c4c4;">Running the unit tests requires building the whole project with `WITH_TESTING` on</span>, i.e. `cmake paddle_dir -DWITH_TESTING=ON`
- After a successful build, run the unit tests with:
```bash
make test ARGS="-R test_mul_op -V"
```
or:
```
ctest -R test_mul_op
```
</font>
---
### Caveats when adding an Op
<font size=5>
- Give each Op its own `*_op.h` (if any), `*_op.cc`, and `*_op.cu` (if any). <span style="background-color:#e1c4c4;">A single file must not contain more than one Op</span>; that breaks the build.
- The type name used at registration must equal the Op's name. <span style="background-color:#e1c4c4;">Registering `REGISTER_OP(B, ...)` inside `A_op.cc` is not allowed</span>; it breaks the unit tests.
- If an Op <span style="background-color:#e1c4c4;">has no CUDA Kernel, do not create an empty `*_op.cu`</span>; that breaks the unit tests.
- If several Ops depend on shared helper functions, put them in files not named `*_op.*`, e.g. `gather.h`.
</font>
---
### ==12.== Usage FAQ
---
### Defining the forward computation
<font size=5>
- When the Python side executes:
```python
import paddle.v2.fluid as fluid
```
[`framework.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#L1040) defines two global `Program`s:
```python
# program is a global instance.
_main_program_ = Program()
_startup_program_ = Program()
```
- Defining the forward pass means continually appending Ops and Variables to the `main_program`
- To execute a different `main_program`, call:
```python
def switch_main_program(program):
"""
Switch the main program to a new program.
This function returns the previous main program.
"""
……
```
</font>
---
### Customizing parameter initialization
<font size=5>
- Call the `fluid.ParamAttr(……)` interface to customize parameter initialization
```python
w_param_attrs = ParamAttr(name=None,
initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
learning_rate=1.0,
regularizer=L1Decay(1.0),
trainable=True,
clip=GradientClipByValue(-1.0, 1.0),
)
y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
```
- 补充问题:如何创建 `Variable`
```python
cur_program = Program()
cur_block = cur_program.current_block()
new_var = cur_block.create_var(name="X", shape=[-1, 16, 16], dtype="float32")
```
</font>
---
### Appending backward Ops
<font size=5>
- Call `fluid.backward.append_backward(X)` (where `X` is a Variable) to append the backward Ops for a forward `ProgramDesc`
```python
data = fluid.layers.data(name="data", shape=(2,3,4))
out = fluid.layers.fc(input=data,size=128,act=None)
loss = fluid.layers.reduce_sum(out)
fluid.backward.append_backward(loss=loss)
```
- Append the optimization-related Ops
```python
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(loss)
```
- You can call `print(fluid.default_main_program())` at any time to dump the current `main_program`
- Once the whole `Program` is built, run memory optimization with:
```python
fluid.memory_optimize(fluid.default_main_program())
```
- _<span style="background-color:#e1c4c4;">Note: memory optimization is still under active development and may not be stable.</span>_
</font>
---
### Summary: the compile-time flow
<font size=5>
- The user defines the forward computation
- Backward Ops are appended to `default_main_program`
- Gradient-clipping Ops are appended to `default_main_program`
- Regularization Ops are appended to `default_main_program`
- The state variables of the chosen optimizer are added to `default_startup_program`
  - state variables are things like the learning rate, momentum history, second-order momentum, etc.
- Variable-initialization Ops are added to `default_startup_program`
- An Op is appended to `default_main_program` that sets the gradient received by the network's last op
- Memory-optimization planning runs (the whole flow is sketched below)
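- Put together, a minimal sketch of the flow (gradient clipping and regularization would be configured via `ParamAttr`/optimizer arguments):
```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=y_predict, label=y))

fluid.optimizer.SGD(learning_rate=0.001).minimize(loss)  # appends backward + optimize ops
fluid.memory_optimize(fluid.default_main_program())      # memory-reuse planning
```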
</font>
---
### Feeding data (1): through a feed dict
<font size=5>
- When calling the executor's run method, pass a feed dict; the feed op places the given data into the `x` and `y` Variables
```python
y_data = np.random.randint(0, 8, [1]).astype("int32")
y_tensor = core.Tensor()
y_tensor.set(y_data, place)
x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32")
x_tensor = core.Tensor()
x_tensor.set(x_data, place)
……
cost = exe.run(
fluid.default_main_program(),
feed={'x': x_tensor,
'y': y_tensor},
fetch_list=[avg_cost])
```
- This approach is fairly low-level and is mostly used in unit tests
</font>
---
### Feeding data (2): through the DataFeeder interface
<font size=5>
- Write a data_reader function; a data_reader is a Python generator
```python
def demo_reader():
def random_generator():
yield np.random.uniform(0.1, 1, [4]), np.random.randint(0, 1, [1])
return random_generator
```
- Use the DataFeeder interface in the training job
```python
train_reader = paddle.batch(
paddle.reader.shuffle(demo_reader(), buf_size=500), batch_size=4)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
for data in train_reader():
cost = exe.run(
fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[cost])
```
</font>
---
### FAQ
<font size=5>
- How do I use an evaluator? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py#L168)
```python
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
for pass_id in range(PASS_NUM):
accuracy.reset()
for data in train_reader():
loss, acc = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe)
# acc: accuracy of the current batch
# pass_acc: accumulated accuracy over the batches seen so far in this pass
pass_total_acc = accuracy.eval(exe) # accuracy over the whole pass
```
- How do I evaluate during training? [->](https://github.com/dzhwinter/benchmark/blob/master/fluid/vgg16.py#L144)
- How do I save a trained model? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L143)
- How do I load a trained model for inference? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L154)
- How do I define several Programs in one training job and run them alternately? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/demo/fc_gan.py)
- How do I profile? Fluid ships a profiling tool that can be called directly; see the example [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_profiler.py)
</font>
---
......@@ -35,13 +35,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
# 2. 可选步骤:源码中构建用于编译PaddlePaddle的Docker镜像
docker build -t paddle:dev .
# 3. 执行下面的命令编译CPU-Only的二进制
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. 或者也可以使用为上述可选步骤构建的镜像(必须先执行第2步)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
注:上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。如果使用自行
构建的镜像(上述第4步)会执行 :code:`Dockerfile` 描述的默认入口程序 :code:`build.sh` 可以省略步骤3中
最后的执行脚本的命令。
注:上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。
编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装:
......@@ -72,15 +70,15 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
如果期望执行其中一个单元测试,(比如 :code:`test_sum_op` ):
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh
cd /paddle/build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
./paddle/scripts/paddle_build.sh build
cd build
ctest -R test_sum_op -V
.. _faq_docker:
......
......@@ -34,14 +34,12 @@ Or you can build your own image from source as the optional step below:
# 2. Optional: build development docker image from source
docker build -t paddle:dev .
# 3. Run the following command to build a CPU-Only binaries
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
NOTE: The above command try to mount the current working directory (root directory of source code)
into :code:`/paddle` directory inside docker container. If you are using your own image
(Step 4) it will run default entry-point :code:`build.sh` , so you could omit the last
command in step 3.
into :code:`/paddle` directory inside docker container.
When the compile finishes, you can get the output whl package under
build/python/dist, then you can choose to install the whl on local
......@@ -74,15 +72,15 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
If you wish to run only one unit test, like :code:`test_sum_op`:
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh
cd /paddle/build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
./paddle/scripts/paddle_build.sh build
cd build
ctest -R test_sum_op -V
.. _faq_docker:
......
......@@ -98,7 +98,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
国内用户可以使用下面的镜像源来加速访问:
.. code-block: bash
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
......
......@@ -105,7 +105,7 @@ We provide a packaged book image, simply issue the command:
For users in China, we provide a faster mirror:
.. code-block: bash
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
......
......@@ -11,7 +11,6 @@ GTAGS
*.pb.cc
*.pb.h
*_pb2.py
paddle_*
output/
google/
Makefile
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
add_subdirectory(inference)
......@@ -89,7 +89,7 @@ cd Paddle
# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
nvidia-docker build -t paddle:float16 .
# After running this, different results will be written to different log files in Paddle/contrib/float16/
nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/paddle/contrib/float16/run_float16_demo.sh
```
#### Accuracy
......
......@@ -3,7 +3,7 @@
BUILD_PATH=/paddle/fp16_build
WHEEL_PATH=$BUILD_PATH/python/dist
INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
DEMO_PATH=/paddle/contrib/float16
DEMO_PATH=/paddle/paddle/contrib/float16
# Use the single most powerful CUDA GPU on your machine
export CUDA_VISIBLE_DEVICES=0
......@@ -50,7 +50,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
--repeat=$REPEAT \
......@@ -68,7 +67,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_resnet \
--data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
--repeat=$REPEAT \
......@@ -86,7 +84,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
--repeat=$REPEAT \
......@@ -104,7 +101,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
--repeat=$REPEAT \
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
function(inference_api_test TARGET_NAME TEST_SRC DEP_TEST)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS})
list(APPEND arg_list "_${arg}")
endforeach()
else()
list(APPEND arg_list "_")
endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(${TARGET_NAME}
SRCS ${TEST_SRC}
DEPS paddle_fluid_api paddle_inference_api paddle_inference_api_impl
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
# set_tests_properties(${TARGET_NAME}
# PROPERTIES DEPENDS ${DEP_TEST})
endforeach()
endfunction(inference_api_test)
cc_library(paddle_inference_api
SRCS paddle_inference_api.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_library(paddle_inference_api_impl
SRCS paddle_inference_api_impl.cc
DEPS paddle_inference_api paddle_fluid_api)
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api)
inference_api_test(test_paddle_inference_api_impl
test_paddle_inference_api_impl.cc
test_word2vec)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
......@@ -12,49 +12,74 @@
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the definition of a simple Inference API for Paddle.
*
* ATTENTION: It requires some C++ features, for lower version C++ or C, we
* might release another API.
*/
#pragma once
#include <memory>
#include <string>
#include <vector>
namespace paddle {
class Predictor {
public:
struct Attr;
Predictor() = default;
enum PaddleDType {
FLOAT32,
INT64,
};
// Build the network before inference.
bool Init(const Attr& attr);
struct PaddleBuf {
void* data; // pointer to the data memory.
size_t length; // number of memory bytes.
};
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
/*
* A simple Inference API for Paddle. Currently this API might just be used by
* non-sequence scenerios.
* TODO(Superjomn) Prepare another API for NLP-related usages.
*/
class PaddlePredictor {
public:
struct Config;
PaddlePredictor() = default;
PaddlePredictor(const PaddlePredictor&) = delete;
// Predict an record.
// Arguments:
// inputs: the name of the input variables.
// outputs: the name of the output varaibles.
// input_shapes: the shape of the input variables.
// output_shapes: the shape of the output variables.
// input_data: the data of the input variables.
// output_data: the data of the output variables.
bool Run(const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::vector<std::vector<int>>& input_shapes,
const std::vector<std::vector<int>>& output_shapes,
const std::vector<std::vector<float>>& input_data,
std::vector<std::vector<float>>* output_data);
// Clone a predictor that share the model weights.
Predictor* Clone();
// The caller should be responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be alive until Run returns. caller should be
// responsible for releasing the memory of `output_data`.
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) = 0;
// Clone a predictor that share the model weights, the Cloned predictor should
// be thread-safe.
virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
// Destroy the Predictor.
~Predictor();
virtual ~PaddlePredictor() {}
struct Attr {
friend std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
const PaddlePredictor::Config& config);
// The common configs for all the predictors.
struct Config {
enum class EngineKind;
std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on
// third-party engines.
EngineKind engine_kind{Attr::EngineKind::kNone};
// third-party engines.
EngineKind engine_kind{Config::EngineKind::kNone};
enum class EngineKind {
kNone = -1, // Use the native Fluid facility.
......@@ -66,4 +91,8 @@ public:
};
};
// A factory to help create difference predictor.
template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sys/time.h>
#include <algorithm>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
namespace paddle {
namespace {
// Timer for timer
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
template <class T>
std::string num2str(T a) {
std::stringstream istr;
istr << a;
return istr.str();
}
} // namespace
bool PaddlePredictorImpl::Init() {
VLOG(3) << "Predictor::init()";
// TODO(panyx0718): Should CPU vs GPU device be decided by id?
if (config_.device >= 0) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
paddle::framework::InitDevices(false);
executor_.reset(new paddle::framework::Executor(place_));
scope_.reset(new paddle::framework::Scope());
// Initialize the inference program
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << "fail to load inference model.";
return false;
}
ctx_ = executor_->Prepare(*inference_program_, 0);
// Create variables
// TODO(panyx0718): Why need to test share_variables here?
if (config_.share_variables) {
executor_->CreateVariables(*inference_program_, scope_.get(), 0);
}
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true;
}
bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
VLOG(3) << "Predictor::predict";
Timer timer;
timer.tic();
// set feed variable
std::map<std::string, const paddle::framework::LoDTensor *> feed_targets;
std::vector<paddle::framework::LoDTensor> feeds;
if (!SetFeed(inputs, &feeds)) {
LOG(ERROR) << "fail to set feed";
return false;
}
for (size_t i = 0; i < feed_target_names_.size(); ++i) {
feed_targets[feed_target_names_[i]] = &feeds[i];
}
// get fetch variable
std::map<std::string, paddle::framework::LoDTensor *> fetch_targets;
std::vector<paddle::framework::LoDTensor> fetchs;
fetchs.resize(fetch_target_names_.size());
for (size_t i = 0; i < fetch_target_names_.size(); ++i) {
fetch_targets[fetch_target_names_[i]] = &fetchs[i];
}
// Run the inference program
// if share variables, we need not create variables
executor_->RunPreparedContext(ctx_.get(),
scope_.get(),
&feed_targets,
&fetch_targets,
!config_.share_variables);
if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs";
return false;
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
return true;
}
std::unique_ptr<PaddlePredictor> PaddlePredictorImpl::Clone() {
VLOG(3) << "Predictor::clone";
std::unique_ptr<PaddlePredictorImpl> cls(new PaddlePredictorImpl(config_));
if (!cls->InitShared(this)) {
LOG(ERROR) << "fail to call InitShared";
return nullptr;
}
return cls;
}
// TODO(panyx0718): Consider merge with Init()?
bool PaddlePredictorImpl::InitShared(PaddlePredictorImpl *cls) {
VLOG(3) << "Predictor::init_shared";
// 1. Define place, executor, scope
if (this->config_.device >= 0) {
place_ = paddle::platform::CUDAPlace();
} else {
place_ = paddle::platform::CPUPlace();
}
this->executor_.reset(new paddle::framework::Executor(this->place_));
this->scope_.reset(new paddle::framework::Scope());
// Initialize the inference program
if (!this->config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
this->inference_program_ = paddle::inference::Load(
this->executor_.get(), this->scope_.get(), this->config_.model_dir);
} else if (!this->config_.prog_file.empty() &&
!this->config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
this->inference_program_ =
paddle::inference::Load(this->executor_.get(),
this->scope_.get(),
this->config_.prog_file,
this->config_.param_file);
}
this->ctx_ = this->executor_->Prepare(*this->inference_program_, 0);
// 3. create variables
// TODO(panyx0718): why test share_variables.
if (config_.share_variables) {
this->executor_->CreateVariables(
*this->inference_program_, this->scope_.get(), 0);
}
// 4. Get the feed_target_names and fetch_target_names
this->feed_target_names_ = this->inference_program_->GetFeedTargetNames();
this->fetch_target_names_ = this->inference_program_->GetFetchTargetNames();
return true;
}
bool PaddlePredictorImpl::SetFeed(
const std::vector<PaddleTensor> &inputs,
std::vector<paddle::framework::LoDTensor> *feeds) {
VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feed_target_names_.size()) {
LOG(ERROR) << "wrong feed input size.";
return false;
}
for (size_t i = 0; i < feed_target_names_.size(); ++i) {
paddle::framework::LoDTensor input;
paddle::framework::DDim ddim =
paddle::framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr =
input.mutable_data<int64_t>(ddim, paddle::platform::CPUPlace());
} else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, paddle::platform::CPUPlace());
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr),
inputs[i].data.data,
inputs[i].data.length);
feeds->push_back(input);
LOG(ERROR) << "Actual feed type " << feeds->back().type().name();
}
return true;
}
bool PaddlePredictorImpl::GetFetch(
const std::vector<paddle::framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *outputs) {
VLOG(3) << "Predictor::get_fetch";
outputs->resize(fetchs.size());
for (size_t i = 0; i < fetchs.size(); ++i) {
// TODO(panyx0718): Support fetch of other types.
if (fetchs[i].type() != typeid(float)) {
LOG(ERROR) << "only support fetching float now.";
return false;
}
std::vector<int> shape;
auto dims_i = fetchs[i].dims();
auto lod = fetchs[i].lod();
const float *output_ptr = fetchs[i].data<float>();
// const int64_t* output_ptr = fetchs[i].data<int64_t>();
auto num = fetchs[i].numel();
std::vector<float> data;
if (0 == lod.size()) {
std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
for (int j = 0; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
} else {
// for batch detection
// image[0] -> output[0] shape {145, 6}
// image[1] -> output[1] shape {176, 6}
// then,
// the batch output shape {321, 6}
// the lod {{0, 145, 321}}
// so we should append output[0] to {176, 6}
size_t max_dim = 0;
for (size_t j = 1; j < lod[0].size(); j++) {
max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
}
size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
if (max_dim > 0) {
data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
}
for (size_t j = 1; j < lod[0].size(); j++) {
size_t start = lod[0][j - 1] * common_dim;
size_t end = lod[0][j] * common_dim;
if (end > start) {
std::copy(output_ptr + start,
output_ptr + end,
data.begin() + (j - 1) * max_dim * common_dim);
}
}
shape.push_back(lod[0].size() - 1);
shape.push_back(max_dim);
for (int j = 1; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
}
outputs->at(i).shape = shape;
outputs->at(i).data.length = sizeof(float) * data.size();
outputs->at(i).data.data = malloc(outputs->at(i).data.length);
std::memcpy(
outputs->at(i).data.data, data.data(), outputs->at(i).data.length);
outputs->at(i).dtype = PaddleDType::FLOAT32;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy.
}
return true;
}
std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl(
const VisConfig &config) {
VLOG(3) << "create PaddlePredictorImpl";
// 1. GPU memeroy
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
num2str<float>(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
}
std::unique_ptr<PaddlePredictorImpl> predictor(
new PaddlePredictorImpl(config));
if (!predictor->Init()) {
return nullptr;
}
return predictor;
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <memory>
#include <string>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
struct VisConfig : public PaddlePredictor::Config {
int device;
float fraction_of_gpu_memory;
std::string prog_file;
std::string param_file;
bool share_variables;
};
/*
* Do not use this, just a demo indicating how to customize a Predictor.
*/
class PaddlePredictorImpl : public PaddlePredictor {
public:
explicit PaddlePredictorImpl(const VisConfig &config) : config_(config) {}
bool Init();
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override;
std::unique_ptr<PaddlePredictor> Clone() override;
~PaddlePredictorImpl() override {}
private:
bool InitShared(PaddlePredictorImpl *cls);
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
std::vector<paddle::framework::LoDTensor> *feeds);
bool GetFetch(const std::vector<paddle::framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *output_data);
VisConfig config_;
paddle::platform::Place place_;
std::unique_ptr<paddle::framework::Executor> executor_;
std::unique_ptr<paddle::framework::Scope> scope_;
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx_;
std::unique_ptr<paddle::framework::ProgramDesc> inference_program_;
std::vector<std::string> feed_target_names_;
std::vector<std::string> fetch_target_names_;
};
std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl(
const VisConfig &config);
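// A minimal usage sketch (the model path below is a placeholder and `inputs`
// is assumed to be a prepared std::vector<PaddleTensor>):
//   VisConfig config;
//   config.model_dir = "/path/to/inference_model";
//   config.fraction_of_gpu_memory = 0.5;
//   config.device = 0;
//   std::unique_ptr<PaddlePredictorImpl> predictor =
//       CreatePaddlePredictorImpl(config);
//   std::vector<PaddleTensor> outputs;
//   if (predictor) predictor->Run(inputs, &outputs);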
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
namespace paddle {
/*
* Do not use this, just a demo indicating how to customize a config for a
* specific predictor.
*/
struct DemoConfig : public PaddlePredictor::Config {
float other_config;
};
/*
* Do not use this, just a demo indicating how to customize a Predictor.
*/
class DemoPredictor : public PaddlePredictor {
public:
explicit DemoPredictor(const DemoConfig &config) {
LOG(INFO) << "I get other_config " << config.other_config;
}
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override {
LOG(INFO) << "Run";
return false;
}
std::unique_ptr<PaddlePredictor> Clone() override { return nullptr; }
~DemoPredictor() override {}
};
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<DemoConfig>(
const DemoConfig &config) {
std::unique_ptr<PaddlePredictor> x(new DemoPredictor(config));
return x;
}
TEST(paddle_inference_api, demo) {
DemoConfig config;
config.other_config = 1.7;
auto predictor = CreatePaddlePredictor(config);
std::vector<PaddleTensor> outputs;
predictor->Run({}, &outputs);
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
namespace paddle {
PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
PaddleTensor pt;
pt.data.data = t->data<void>();
if (t->type() == typeid(int64_t)) {
pt.data.length = t->numel() * sizeof(int64_t);
pt.dtype = PaddleDType::INT64;
} else if (t->type() == typeid(float)) {
pt.data.length = t->numel() * sizeof(float);
pt.dtype = PaddleDType::FLOAT32;
} else {
LOG(FATAL) << "unsupported type.";
}
pt.shape = framework::vectorize2int(t->dims());
return pt;
}
TEST(paddle_inference_api_impl, word2vec) {
VisConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.85;
config.device = 0;
config.share_variables = true;
std::unique_ptr<PaddlePredictorImpl> predictor =
CreatePaddlePredictorImpl(config);
framework::LoDTensor first_word, second_word, third_word, fourth_word;
framework::LoD lod{{0, 1}};
int64_t dict_size = 2073; // The size of dictionary
SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
std::vector<PaddleTensor> cpu_feeds;
cpu_feeds.push_back(LodTensorToPaddleTensor(&first_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&second_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&third_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&fourth_word));
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(cpu_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL);
for (size_t i = 0; i < outputs.size(); ++i) {
size_t len = outputs[i].data.length;
float* data = static_cast<float*>(outputs[i].data.data);
for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0);
}
free(outputs[i].data.data);
}
}
} // namespace paddle
......@@ -243,13 +243,8 @@ const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
}
void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
for (auto &output : outputs_) {
std::replace(output.second.begin(), output.second.end(), old_name,
new_name);
}
RenameInput(old_name, new_name);
RenameOutput(old_name, new_name);
need_update_ = true;
}
......@@ -274,6 +269,13 @@ void OpDesc::RenameInput(const std::string &old_name,
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
if (it != attrs_.end()) {
auto &op_vars = boost::get<std::vector<std::string>>(it->second);
std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
}
need_update_ = true;
}
......
......@@ -63,6 +63,7 @@ class InferShapeContext {
std::vector<InferShapeVarPtr> GetInputVarPtrs(const std::string &name);
std::vector<InferShapeVarPtr> GetOutputVarPtrs(const std::string &name);
virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
// Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names,
......@@ -81,8 +82,6 @@ class InferShapeContext {
const std::vector<std::string> &names) const;
virtual proto::VarType::Type GetVarType(const std::string &name) const = 0;
virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
};
} // namespace framework
......
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api
SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
......
nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
SERIAL)
# This test is not stable
# See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828
#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
# DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
# SERIAL)
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
......@@ -149,12 +149,14 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
}
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
// GPU data is copied to CPU buffer when sending,
// free the buffer when possible.
destroy_callback = [](void* backing) {
platform::CUDAPinnedPlace cuda_pinned;
memory::Free(cuda_pinned, backing);
};
#endif
}
std::string header;
......
......@@ -24,6 +24,8 @@ detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
detection_library(target_assign_op SRCS target_assign_op.cc
target_assign_op.cu)
detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu)
# Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto* in = ctx.Input<Tensor>("Input");
auto in_dims = in->dims();
const T* in_data = in->data<T>();
auto* out = ctx.Output<Tensor>("Output");
T* out_data = out->mutable_data<T>(ctx.GetPlace());
int batch_size = in_dims[0];
int geo_channel = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int id = 0;
for (int id_n = 0; id_n < batch_size * geo_channel; ++id_n) {
for (int id_h = 0; id_h < height; ++id_h) {
for (int id_w = 0; id_w < width; ++id_w) {
id = id_n * height * width + width * id_h + id_w;
if (id_n % 2 == 0) {
out_data[id] = id_w - in_data[id];
} else {
out_data[id] = id_h - in_data[id];
}
}
}
}
}
};
class PolygonBoxTransformOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("Input"),
"Input (Input) of polygon_box transform op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Output"),
"Output (Output) of polygon_box transform op should not be null.");
auto in_dim = ctx->GetInputDim("Input");
PADDLE_ENFORCE_EQ(in_dim.size(), 4, "input's rank must be 4.");
PADDLE_ENFORCE_EQ(in_dim[1] % 2, 0,
"input's second dimension must be even.");
ctx->SetOutputDim("Output", in_dim);
}
};
class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"Input",
"The input with shape [batch_size, geometry_channels, height, width]");
AddOutput("Output", "The output with the same shape as input");
AddComment(R"DOC(
PolygonBoxTransform Operator.
The input is the final geometry output in detection network.
We use 2*n numbers to denote the coordinate shift from n corner vertices of
the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi),
the geometry output contains 2*n channels.
PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
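For example, for the output pixel at (id_h=2, id_w=3), an x-offset value of
1.5 in an even geometry channel becomes 3 - 1.5 = 1.5, and a y-offset value
of 0.5 in the following odd channel becomes 2 - 0.5 = 1.5.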
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(polygon_box_transform, ops::PolygonBoxTransformOp,
ops::PolygonBoxTransformOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
polygon_box_transform,
ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, float>,
ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using platform::PADDLE_CUDA_NUM_THREADS;
#define CUDA_BLOCK_SIZE 16
template <typename T>
__global__ void PolygonBoxTransformKernel(const int n, const int h, const int w,
const T* input, T* output) {
int id_n = threadIdx.x + blockDim.x * blockIdx.x;
int id_h = threadIdx.y + blockDim.y * blockIdx.y;
int id_w = threadIdx.z + blockDim.z * blockIdx.z;
if (id_n < n && id_h < h && id_w < w) {
int id = id_n * h * w + w * id_h + id_w;
if (id_n % 2 == 0) {
output[id] = id_w - input[id];
} else {
output[id] = id_h - input[id];
}
}
}
template <typename T>
class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto* in = ctx.Input<Tensor>("Input");
auto in_dims = in->dims();
const T* in_data = in->data<T>();
auto* out = ctx.Output<Tensor>("Output");
T* out_data = out->mutable_data<T>(ctx.GetPlace());
int batch_size = in_dims[0];
int geo_channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
dim3 threadsPerBlock(
PADDLE_CUDA_NUM_THREADS / (CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE),
CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE);
// round up so that trailing (batch * channel) elements are still covered
dim3 numBlocks((batch_size * geo_channels + threadsPerBlock.x - 1) /
threadsPerBlock.x,
(height + threadsPerBlock.y - 1) / threadsPerBlock.y,
(width + threadsPerBlock.z - 1) / threadsPerBlock.z);
auto stream = ctx.cuda_device_context().stream();
PolygonBoxTransformKernel<T><<<numBlocks, threadsPerBlock, 0, stream>>>(
batch_size * geo_channels, height, width, in_data, out_data);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
polygon_box_transform,
paddle::operators::PolygonBoxTransformOpCUDAKernel<float>,
paddle::operators::PolygonBoxTransformOpCUDAKernel<double>);
......@@ -24,6 +24,14 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Output<framework::Tensor>("Out");
auto* in = ctx.Input<framework::LoDTensor>("Input");
if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
// set the correct batch size for the LoDTensor.
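// e.g. a LoD of {{0, 2, 5}} stores 3 offsets, i.e. 2 sequences, so the
// batch size is lod().back().size() - 1 = 2.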
auto odims = out->dims();
int output_dim_idx = ctx.Attr<int>("output_dim_idx");
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->mutable_data<T>(odims, ctx.GetPlace());
}
out->mutable_data<T>(ctx.GetPlace());
auto value = ctx.Attr<float>("value");
......
......@@ -46,7 +46,10 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
const int64_t* label_data = labels->data<int64_t>();
for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i];
int lbl = label_data[i];
PADDLE_ENFORCE_GE(lbl, 0);
PADDLE_ENFORCE_LT(lbl, class_num);
int index = i * class_num + lbl;
loss_data[i] = -math::TolerableValue<T>()(std::log(prob_data[index]));
}
}
......
......@@ -23,6 +23,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o
reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
# Export local libraries to parent
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class CustomReader : public framework::DecoratedReader {
public:
CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block,
const platform::Place& dev_place,
const std::vector<std::string>& source_var_names,
const std::vector<std::string>& sink_var_names)
: DecoratedReader(reader),
program_(*sub_block.Program()),
sub_block_id_(sub_block.ID()),
exe_(framework::Executor(dev_place)),
source_var_names_(source_var_names),
sink_var_names_(sink_var_names) {}
void ReadNext(std::vector<framework::LoDTensor>* out) override;
private:
const framework::ProgramDesc program_;
int sub_block_id_;
framework::Executor exe_;
std::vector<std::string> source_var_names_;
std::vector<std::string> sink_var_names_;
};
class CreateCustomReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
auto* sub_block = Attr<framework::BlockDesc*>("sub_block");
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
out->Reset(
new CustomReader(underlying_reader.Get(), *sub_block, dev_place,
Attr<std::vector<std::string>>("source_var_names"),
Attr<std::vector<std::string>>("sink_var_names")));
}
};
class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
protected:
void Apply() override {
AddAttr<framework::BlockDesc*>(
"sub_block", "The block to hold all preprocessing operators.");
AddAttr<std::vector<std::string>>(
"source_var_names",
"Source variables are starting points of data preprocessing. They hold "
"preprocessing's input tensors. Each source variable corresponds to "
"one of underlying reader's output datas.");
AddAttr<std::vector<std::string>>(
"sink_var_names",
"Sink variables are ending points of data preprocessing. They hold "
"preprocessing's output tensors. Each sink variable corresponds to "
"one of custom reader's output datas.");
AddComment(R"DOC(
CreateCustomReader Operator
A custom reader can be used for input data preprocessing.
A custom reader holds its own sub-block, which will be executed in its
'ReadNext()' function. Users can configure their own preprocessing
pipelines by inserting operators into the custom reader's sub-block.
)DOC");
}
};
class CustomReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(!ctx->IsRuntime(),
"'CustomReaderInferShape' should only be invoked during "
"compile time.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output decorated reader should not be null.");
const auto* sub_block =
ctx->Attrs().Get<framework::BlockDesc*>("sub_block");
const auto sink_var_names =
ctx->Attrs().Get<std::vector<std::string>>("sink_var_names");
std::vector<std::vector<int64_t>> res_dims;
std::vector<int32_t> res_lod_levels;
for (const std::string& var_name : sink_var_names) {
auto* sink_var = sub_block->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(sink_var);
res_dims.emplace_back(sink_var->GetShape());
res_lod_levels.push_back(sink_var->GetLoDLevel());
}
auto* out_reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetShapes(res_dims);
out_reader->SetLoDLevels(res_lod_levels);
}
};
class CustomReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]);
PADDLE_ENFORCE_NOT_NULL(out_reader);
out_reader->SetType(framework::proto::VarType::READER);
auto sink_var_names =
boost::get<std::vector<std::string>>(op_desc.GetAttr("sink_var_names"));
const auto* sub_block =
boost::get<framework::BlockDesc*>(op_desc.GetAttr("sub_block"));
std::vector<framework::proto::VarType::Type> res_data_types;
for (const std::string& var_name : sink_var_names) {
framework::VarDesc* var = sub_block->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var);
res_data_types.emplace_back(var->GetDataType());
}
out_reader->SetDataTypes(res_data_types);
}
};
void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
out->clear();
std::vector<framework::LoDTensor> underlying_outs;
reader_->ReadNext(&underlying_outs);
if (underlying_outs.empty()) {
// There is no more data.
return;
}
PADDLE_ENFORCE(source_var_names_.size() == underlying_outs.size(),
"The size of source_var_names(%d) and the size of "
"underlying_outs(%d) are not consistent. Each feeding element "
"must have its own source variable.",
source_var_names_.size(), underlying_outs.size());
// The scope for CustomReader's sub-block should be independent and shouldn't
// be any other computation scope's child. Otherwise, data preprocessing and
// computation cannot run concurrently.
framework::Scope scope;
// 1. Copy LoDTensors from underlying reader's output to source variables.
for (size_t i = 0; i < source_var_names_.size(); ++i) {
framework::Variable* var = scope.Var(source_var_names_[i]);
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
tensor->ShareDataWith(underlying_outs[i]);
tensor->set_lod(underlying_outs[i].lod());
}
// 2. Run the sub-block.
exe_.Run(program_, &scope, sub_block_id_, false, true);
// 3. Copy LoDTensors from sink variables to out.
out->resize(sink_var_names_.size());
for (size_t i = 0; i < sink_var_names_.size(); ++i) {
const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i]))
.Get<framework::LoDTensor>();
framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
}
}
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_OPERATOR(create_custom_reader, ops::CreateCustomReaderOp,
ops::CreateCustomReaderOpMaker, ops::CustomReaderInferShape,
ops::CustomReaderInferVarType,
paddle::framework::EmptyGradOpMaker)
......@@ -115,6 +115,7 @@ void DecoratedReaderInferShape::operator()(
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetLoDLevels(in_reader->GetLoDLevels());
}
void DecoratedReaderInferVarType::operator()(
const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "unsupported/Eigen/CXX11/Tensor"
#include "paddle/function/EigenThreadDevice.h"
namespace paddle {
......@@ -70,25 +70,26 @@ struct EigenBlasGemm {
dims[0].first = transA ? 0 : 1;
dims[0].second = transB ? 1 : 0;
Eigen::DefaultDevice device;
auto* device = EigenDeviceWarpper::device();
if (N == ldc) {
if (alpha == T(1) && beta == T(0)) {
c.device(device) = a.contract(b, dims);
c.device(*device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.device(device) += a.contract(b, dims);
c.device(*device) += a.contract(b, dims);
} else {
c.device(device) = alpha * a.contract(b, dims) + beta * c;
c.device(*device) = alpha * a.contract(b, dims) + beta * c;
}
} else {
if (alpha == T(1) && beta == T(0)) {
c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
c.slice(offsetC, extentC).device(*device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
c.slice(offsetC, extentC).device(*device) += a.contract(b, dims);
} else {
c.slice(offsetC, extentC).device(device) =
c.slice(offsetC, extentC).device(*device) =
alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
}
}
EigenDeviceWarpper::free_device(device);
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#if defined(__OSX__) || defined(__APPLE__)
#include <sys/sysctl.h>
#include <sys/types.h>
#endif
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
#if defined(__ANDROID__)
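// On Android, /sys/devices/system/cpu/possible holds a range such as "0-7";
// the CPU count is the upper bound plus one (8 in this example).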
int GetCpuCount() {
FILE* fp = fopen("/sys/devices/system/cpu/possible", "r");
if (!fp) {
return 1;
}
int rank0, rank1;
int num = fscanf(fp, "%d-%d", &rank0, &rank1);
fclose(fp);
if (num < 2) return 1;
return rank1 + 1;
}
#elif defined(__OSX__) || defined(__APPLE__)
int GetCpuCount() {
int count = 0;
size_t len = sizeof(int);
sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
return count > 0 ? count : 1;
}
#else
int GetCpuCount() { return 1; }
#endif
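// Wraps the Eigen device used by EigenBlasGemm: with EIGEN_USE_THREADS a
// process-wide ThreadPoolDevice with at most two threads is created once and
// reused (free_device is a no-op); otherwise a DefaultDevice is allocated per
// call and released in free_device.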
class EigenDeviceWarpper {
public: // NOLINT
#if EIGEN_USE_THREADS
static Eigen::ThreadPoolDevice* device() {
const int num_cpus = GetCpuCount();
const int num_threads = (num_cpus > 2) ? 2 : num_cpus;
static Eigen::ThreadPool tp(num_threads);
static Eigen::ThreadPoolDevice* device =
new Eigen::ThreadPoolDevice(&tp, num_threads);
return device;
}
static void free_device(Eigen::ThreadPoolDevice* device) {
// do nothing
}
#else
static Eigen::DefaultDevice* device() {
Eigen::DefaultDevice* device = new Eigen::DefaultDevice;
return device;
}
static void free_device(Eigen::DefaultDevice* device) { delete device; }
#endif
};
} // namespace paddle
......@@ -7,6 +7,10 @@ set(OPITMIZER_SRCS
sgd_optimizer.cc
)
cc_library(paddle_optimizer STATIC SRCS ${OPITMIZER_SRCS} DEPS paddle_proto glog)
cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
add_library(paddle_optimizer ${OPITMIZER_SRCS})
target_link_libraries(paddle_optimizer paddle_proto glog)
if (WITH_TESTING)
add_unittest(serialization_test serialization_test.cc)
add_unittest(parameter_optimizer_test parameter_optimizer_test.cc)
endif()
#!/bin/bash
function cmake_gen() {
mkdir -p /paddle/build
cd /paddle/build
# build script will not fail if *.deb does not exist
rm *.deb 2>/dev/null || true
# delete previous built whl packages
rm -rf /paddle/paddle/dist 2>/dev/null || true
# Support build for all python versions, currently
# including cp27-cp27m and cp27-cp27mu.
PYTHON_FLAGS=""
if [ "$1" != "" ]; then
echo "using python abi: $1"
if [ "$1" == "cp27-cp27m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
elif [ "$1" == "cp27-cp27mu" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
fi
fi
cat <<EOF
========================================
Configuring cmake in /paddle/build ...
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
${PYTHON_FLAGS}
-DWITH_DSO=ON
-DWITH_DOC=${WITH_DOC:-OFF}
-DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-OFF}
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-DWITH_SWIG_PY=ON
-DWITH_C_API=${WITH_C_API:-OFF}
-DWITH_PYTHON=${WITH_PYTHON:-ON}
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-DCUDNN_ROOT=/usr/
-DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
# docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
${PYTHON_FLAGS} \
-DWITH_DSO=ON \
-DWITH_DOC=${WITH_DOC:-OFF} \
-DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-DWITH_C_API=${WITH_C_API:-OFF} \
-DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \
-DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
}
function run_build() {
cat <<EOF
============================================
Building in /paddle/build ...
============================================
EOF
make clean
make -j `nproc`
}
function run_test() {
if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
cat <<EOF
========================================
Running unit tests ...
========================================
EOF
ctest --output-on-failure
# make install should also be tested when running unit tests
make install -j `nproc`
pip install /usr/local/opt/paddle/share/wheels/*.whl
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version
fi
fi
}
function gen_docs() {
if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
cat <<EOF
========================================
Building documentation ...
In /paddle/build_doc
========================================
EOF
mkdir -p /paddle/build_doc
pushd /paddle/build_doc
cmake .. \
-DWITH_DOC=ON \
-DWITH_GPU=OFF \
-DWITH_AVX=${WITH_AVX:-ON} \
-DWITH_SWIG_PY=ON
make -j `nproc` paddle_docs paddle_apis
popd
fi
if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
cat <<EOF
========================================
Converting C++ source code into HTML ...
========================================
EOF
export WOBOQ_OUT=/paddle/build/woboq_out
mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data
/woboq/generator/codebrowser_generator \
-b /paddle/build \
-a \
-o $WOBOQ_OUT \
-p paddle:/paddle
/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
fi
}
function gen_dockerfile() {
# Set BASE_IMAGE according to env variables
if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04"
else
BASE_IMAGE="ubuntu:16.04"
fi
DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/x86_64-linux-gnu/libcudnn.so"
fi
cat <<EOF
========================================
Generate /paddle/build/Dockerfile ...
========================================
EOF
cat > /paddle/build/Dockerfile <<EOF
FROM ${BASE_IMAGE}
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ENV HOME /root
EOF
if [[ ${WITH_GPU} == "ON" ]]; then
NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&"
else
NCCL_DEPS=""
fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
PADDLE_VERSION="paddle version"
CMD='"paddle", "version"'
else
PADDLE_VERSION="true"
CMD='"true"'
fi
cat >> /paddle/build/Dockerfile <<EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update &&\
${NCCL_DEPS}\
apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \
pip install /*.whl; apt-get install -f -y && \
apt-get clean -y && \
rm -f /*.whl && \
${PADDLE_VERSION} && \
ldconfig
${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_GPU_ENV}
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
cat >> /paddle/build/Dockerfile <<EOF
ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/
EOF
fi
cat >> /paddle/build/Dockerfile <<EOF
# default command shows the paddle version and exit
CMD [${CMD}]
EOF
}
function gen_capi_package() {
if [[ ${WITH_C_API} == "ON" ]]; then
install_prefix="/paddle/build/capi_output"
rm -rf $install_prefix
make DESTDIR="$install_prefix" install
cd $install_prefix/usr/local
ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz
fi
}
function gen_fluid_inference_lib() {
if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
cat <<EOF
========================================
Deploying fluid inference library ...
========================================
EOF
make -j `nproc` inference_lib_dist
fi
}
set -xe
cmake_gen ${PYTHON_ABI:-""}
run_build
run_test
gen_docs
gen_dockerfile
gen_capi_package
gen_fluid_inference_lib
if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n"
else
printf "If you need to install PaddlePaddle in develop docker image,"
printf "please make install or pip install build/python/dist/*.whl.\n"
fi
#!/bin/bash
set -xe
if [ $ANDROID_ABI == "arm64-v8a" ]; then
ANDROID_ARCH=arm64
if [ $ANDROID_API -lt 21 ]; then
echo "Warning: arm64-v8a requires ANDROID_API >= 21."
ANDROID_API=21
fi
else # armeabi, armeabi-v7a
ANDROID_ARCH=arm
fi
ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
cat <<EOF
============================================
Generating the standalone toolchain ...
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh
--arch=$ANDROID_ARCH
--platform=android-$ANDROID_API
--install-dir=${ANDROID_STANDALONE_TOOLCHAIN}
============================================
EOF
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
--arch=$ANDROID_ARCH \
--platform=android-$ANDROID_API \
--install-dir=$ANDROID_STANDALONE_TOOLCHAIN
BUILD_ROOT=/paddle/build_android
DEST_ROOT=/paddle/install_android
mkdir -p $BUILD_ROOT
cd $BUILD_ROOT
if [ $ANDROID_ABI == "armeabi-v7a" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_NEON=ON \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
elif [ $ANDROID_ABI == "arm64-v8a" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DUSE_EIGEN_FOR_BLAS=OFF \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
elif [ $ANDROID_ABI == "armeabi" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
else
echo "Invalid ANDROID_ABI: $ANDROID_ABI"
fi
cat <<EOF
============================================
Building in $BUILD_ROOT ...
============================================
EOF
make -j `nproc`
make install -j `nproc`
#!/bin/bash
/usr/sbin/sshd -D &
jupyter notebook --ip=0.0.0.0 /paddle/book/
#!/bin/bash
set -e
# the number of process to run tests
NUM_PROC=6
# calculate and set the memory usage for each process
MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
# get the CUDA device count
CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
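# Example: with NUM_PROC=6 on 2 GPUs, each process caps itself at ~0.17 of
# GPU memory and process i sees the rotated device list "i%2,(i+1)%2".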
for (( i = 0; i < $NUM_PROC; i++ )); do
cuda_list=()
for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do
s=$[i+j]
n=$[s%CUDA_DEVICE_COUNT]
if [ $j -eq 0 ]; then
cuda_list=("$n")
else
cuda_list="$cuda_list,$n"
fi
done
echo $cuda_list
# CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
# ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC --output-on-failure &
done
wait
......@@ -104,6 +104,8 @@ function cmake_gen() {
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=ON
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
......@@ -129,7 +131,8 @@ EOF
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=ON
}
function abort(){
......
#!/bin/bash
set -e
# Create the build directory for CMake.
mkdir -p $TRAVIS_BUILD_DIR/build
cd $TRAVIS_BUILD_DIR/build
# Compile Documentation only.
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make -j `nproc` paddle_docs paddle_apis
# check websites for broken links
linkchecker doc/v2/en/html/index.html
linkchecker doc/v2/cn/html/index.html
linkchecker doc/v2/api/en/html/index.html
#!/bin/bash
set -e
# Create the build directory for CMake.
mkdir -p $TRAVIS_BUILD_DIR/build_ios
cd $TRAVIS_BUILD_DIR/build_ios
# Compile paddle binaries
cmake -DCMAKE_SYSTEM_NAME=iOS \
-DIOS_PLATFORM=OS \
-DCMAKE_OSX_ARCHITECTURES="arm64" \
-DWITH_C_API=ON \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_TESTING=OFF \
-DWITH_SWIG_PY=OFF \
-DCMAKE_BUILD_TYPE=Release \
..
make -j 2
#!/bin/bash
function abort(){
echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
echo "Please use pre-commit to check what is wrong." 1>&2
exit 1
}
trap 'abort' 0
set -e
# install glide
curl https://glide.sh/get | bash
eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
# set up go environment for running gometalinter
mkdir -p $GOPATH/src/github.com/PaddlePaddle/
ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
cd $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
go get github.com/alecthomas/gometalinter
gometalinter --install
cd $TRAVIS_BUILD_DIR
export PATH=/usr/bin:$PATH
pre-commit install
clang-format --version
if ! pre-commit run -a ; then
git diff
exit 1
fi
trap : 0
......@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from .. import core
from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
......@@ -21,7 +22,8 @@ from ..executor import global_scope
__all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer'
'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
'random_data_generator', 'Preprocessor'
]
......@@ -535,8 +537,6 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
inputs={'UnderlyingReader': reader},
outputs={'Out': [new_reader]},
attrs=attrs)
new_reader.persistable = True
new_reader.stop_gradient = True
return monkey_patch_reader_methods(new_reader)
......@@ -581,3 +581,82 @@ def read_file(file_obj):
return out[0]
else:
return out
class Preprocessor(object):
BEFORE_SUB_BLOCK = 0
IN_SUB_BLOCK = 1
AFTER_SUB_BLOCK = 2
def __init__(self, reader, name=None):
self.underlying_reader = reader
new_reader_name = name if name is not None else unique_name(
"create_custom_reader")
self.main_prog = default_main_program()
self.reader = self.main_prog.current_block().create_var(
name=new_reader_name)
self.sub_block = None
self.source_var_names = None
self.sink_var_names = None
self.status = Preprocessor.BEFORE_SUB_BLOCK
def is_completed(self):
return self.sub_block and self.source_var_names and self.sink_var_names
@contextlib.contextmanager
def block(self):
self.status = Preprocessor.IN_SUB_BLOCK
self.sub_block = self.main_prog.create_block()
yield
self.main_prog.rollback()
self.status = Preprocessor.AFTER_SUB_BLOCK
if not self.is_completed():
raise RuntimeError(
"The definition of preprocessor is incompleted! "
"Please make sure that you have set input and output "
"variables by invoking 'inputs' and 'outputs' in "
"Preprocessor's sub-block.")
def inputs(self):
if self.status != Preprocessor.IN_SUB_BLOCK:
raise RuntimeError(
"Preprocessor.inputs() can only be invoked inside the sub-block."
)
source_shapes = self.underlying_reader.desc.shapes()
source_dtypes = self.underlying_reader.desc.dtypes()
source_lod_levels = self.underlying_reader.desc.lod_levels()
self.source_var_names = [
unique_name("preprocessor_source")
for _ in xrange(len(source_shapes))
]
source_vars = []
for var_name, shape, dtype, lod_level in zip(
self.source_var_names, source_shapes, source_dtypes,
source_lod_levels):
source_vars.append(self.main_prog.current_block().create_var(
name=var_name, shape=shape, dtype=dtype, lod_level=lod_level))
return source_vars
def outputs(self, *outs):
if self.status != Preprocessor.IN_SUB_BLOCK:
raise RuntimeError(
"Preprocessor.outputs() can only be invoked inside the sub-block."
)
self.sink_var_names = [var.name for var in outs]
def __call__(self, *args, **kwargs):
if self.status != Preprocessor.AFTER_SUB_BLOCK:
raise RuntimeError(
"Preprocessor output can only be retrieved after rnn block.")
self.main_prog.current_block().append_op(
type="create_custom_reader",
inputs={'UnderlyingReader': self.underlying_reader},
outputs={'Out': [self.reader]},
attrs={
"sub_block": self.sub_block,
"source_var_names": self.source_var_names,
"sink_var_names": self.sink_var_names
})
return monkey_patch_reader_methods(self.reader)
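# A minimal usage sketch for Preprocessor (assuming an underlying reader
# whose elements are an image tensor and a label; names are illustrative):
#
#     preprocessor = fluid.layers.Preprocessor(reader=raw_reader)
#     with preprocessor.block():
#         img, label = preprocessor.inputs()
#         scaled = fluid.layers.scale(img, scale=1. / 255.)
#         preprocessor.outputs(scaled, label)
#     processed_reader = preprocessor()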
......@@ -24,19 +24,65 @@ from tensor import concat
import utils
__all__ = [
'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru',
'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy',
'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d',
'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'batch_norm',
'beam_search_decode', 'conv2d_transpose', 'sequence_expand', 'lstm_unit',
'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod',
'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk',
'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce',
'beam_search', 'row_conv', 'multiplex', 'layer_norm',
'softmax_with_cross_entropy', 'smooth_l1', 'one_hot',
'autoincreased_step_counter', 'reshape', 'lod_reset', 'lrn', 'pad',
'label_smooth', 'roi_pool', 'dice_loss', 'bilinear_interp', 'random_crop'
'fc',
'embedding',
'dynamic_lstm',
'dynamic_lstmp',
'dynamic_gru',
'gru_unit',
'linear_chain_crf',
'crf_decoding',
'cos_sim',
'cross_entropy',
'square_error_cost',
'chunk_eval',
'sequence_conv',
'conv2d',
'sequence_pool',
'sequence_softmax',
'softmax',
'pool2d',
'batch_norm',
'beam_search_decode',
'conv2d_transpose',
'sequence_expand',
'lstm_unit',
'reduce_sum',
'reduce_mean',
'reduce_max',
'reduce_min',
'reduce_prod',
'sequence_first_step',
'sequence_last_step',
'dropout',
'split',
'ctc_greedy_decoder',
'edit_distance',
'l2_normalize',
'matmul',
'topk',
'warpctc',
'sequence_reshape',
'transpose',
'im2sequence',
'nce',
'beam_search',
'row_conv',
'multiplex',
'layer_norm',
'softmax_with_cross_entropy',
'smooth_l1',
'one_hot',
'autoincreased_step_counter',
'reshape',
'lod_reset',
'lrn',
'pad',
'label_smooth',
'roi_pool',
'dice_loss',
'upsampling_bilinear2d',
'random_crop',
]
......@@ -3881,8 +3927,10 @@ def dice_loss(input, label, epsilon=0.00001):
return reduce_mean(dice_score)
def bilinear_interp(input, out_h, out_w, name=None):
def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
"""
The operation performed by upsampling_bilinear2d is known mathematically as
bilinear interpolation.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this layer) on a rectilinear 2D grid.
......@@ -3894,8 +3942,13 @@ def bilinear_interp(input, out_h, out_w, name=None):
input (Variable): The input tensor of bilinear interpolation,
This is a 4-D tensor of the shape
(num_batches, channels, in_h, in_w).
out_h (int): output height of bilinear interpolation layer.
out_w (int): output width of bilinear interpolation layer.
out_shape(list|tuple|None): Output shape of bilinear interpolation
layer, the shape is (out_h, out_w).
Default: None
scale(int|None): The multiplier for the input height or width.
At least one of out_shape or scale must be set,
and out_shape takes priority over scale.
Default: None
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
......@@ -3906,10 +3959,27 @@ def bilinear_interp(input, out_h, out_w, name=None):
Examples:
.. code-block:: python
out = fluid.layers.bilinear_interp(input, out_h=12, out_w=12)
out = fluid.layers.upsampling_bilinear2d(input, out_shape=[12, 12])
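# alternatively, enlarge both spatial dimensions by an integer factor
out = fluid.layers.upsampling_bilinear2d(input, scale=2)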
"""
if out_shape is None and scale is None:
raise ValueError("One of out_shape and scale must not be None")
helper = LayerHelper('bilinear_interp', **locals())
dtype = helper.input_dtype()
def _is_list_or_tuple_(data):
return isinstance(data, (list, tuple))
if out_shape is not None:
if not (_is_list_or_tuple_(out_shape) and len(out_shape) == 2):
raise ValueError('out_shape should be a list or tuple '
'with length 2, (out_h, out_w).')
out_shape = list(map(int, out_shape))
out_h = out_shape[0]
out_w = out_shape[1]
else:
out_h = int(input.shape[2] * scale)
out_w = int(input.shape[3] * scale)
out = helper.create_tmp_variable(dtype)
helper.append_op(
type="bilinear_interp",
......
......@@ -93,12 +93,12 @@ def _convert_lod(lod):
def create_lod_tensor(data, lod, place):
"""Create a lod tensor from a numpy array or an existing lod tensor.
"""Create a lod tensor from a numpy array, a list, or an existing lod tensor.
Create a lod tensor by doing the following:
1. Check that the length-based input lod is valid.
2. Convert the length-based lod to a offset-based LoD.
3. Copy the data from a numpy array or a existing lod tensor to
3. Copy the data from a numpy array, a list or a existing lod tensor to
CPU or GPU device (based on input place).
4. Set the level of detail (LoD) using the offset-based LoD.
......@@ -117,7 +117,7 @@ def create_lod_tensor(data, lod, place):
for more details regarding LoD.
Args:
data: a numpy array or a LoDTensor holding the data to be copied.
data: a numpy array or a LoDTensor or a list holding the data to be copied.
lod: a list of lists indicating the length-based LoD info specified by the user.
place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
......@@ -126,6 +126,18 @@ def create_lod_tensor(data, lod, place):
"""
if isinstance(data, core.LoDTensor):
return create_lod_tensor(np.array(data), lod, place)
elif isinstance(data, list):
# When input data is a list, it only deals with the case where the base element
# is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated
# LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number
# of words or other indexes in the sequence.
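# For example, data = [[1, 2, 3], [4, 5]] with lod = [[3, 2]] produces a
# LoDTensor of shape [5, 1] holding the five flattened int64 ids.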
new_lod = []
for seq in data:
new_lod.append(len(seq))
assert [new_lod] == lod, "data and lod do not match"
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
return create_lod_tensor(flattened_data, lod, place)
elif isinstance(data, np.ndarray):
assert _validate_lod(lod,
data.shape[0]), "the provided lod info is invalid"
......@@ -134,9 +146,8 @@ def create_lod_tensor(data, lod, place):
tensor.set_lod(_convert_lod(lod))
return tensor
else:
raise Exception(
"data should be either a LoDTensor or a Numpy array, but you pass type %s instead"
% (type(data)))
raise TypeError(
"data should be either a LoDTensor, a Numpy array or a list")
def create_random_int_lodtensor(lod, base_shape, place, low, high):
......
......@@ -48,7 +48,7 @@ def linear():
return avg_loss
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
......@@ -68,8 +68,8 @@ def train(use_cuda, train_program, save_dirname):
['15.343549569447836']
...
'''
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
trainer.stop()
trainer.train(
......@@ -80,13 +80,13 @@ def train(use_cuda, train_program, save_dirname):
# infer
def infer(use_cuda, inference_program, save_dirname=None):
if save_dirname is None:
def infer(use_cuda, inference_program, params_dirname=None):
if params_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 10
tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
......@@ -100,10 +100,10 @@ def main(use_cuda):
return
# Directory for saving the trained model
save_dirname = "fit_a_line.inference.model"
params_dirname = "fit_a_line.inference.model"
train(use_cuda, linear, save_dirname)
infer(use_cuda, inference_program, save_dirname)
train(use_cuda, linear, params_dirname)
infer(use_cuda, inference_program, params_dirname)
class TestFitALine(unittest.TestCase):
......
......@@ -85,7 +85,7 @@ def train_network():
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
BATCH_SIZE = 128
EPOCH_NUM = 1
......@@ -105,8 +105,8 @@ def train(use_cuda, train_program, save_dirname):
print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......@@ -122,10 +122,10 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
......@@ -142,12 +142,14 @@ def main(use_cuda):
save_path = "image_classification_resnet.inference.model"
train(
use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
save_dirname=save_path)
params_dirname=save_path)
if __name__ == '__main__':
......
......@@ -64,7 +64,7 @@ def train_network():
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
BATCH_SIZE = 128
train_reader = paddle.batch(
paddle.reader.shuffle(
......@@ -82,8 +82,8 @@ def train(use_cuda, train_program, save_dirname):
print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......@@ -99,10 +99,10 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
......@@ -119,12 +119,14 @@ def main(use_cuda):
save_path = "image_classification_vgg.inference.model"
train(
use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
save_dirname=save_path)
params_dirname=save_path)
if __name__ == '__main__':
......
......@@ -141,7 +141,7 @@ def train_program():
return [avg_cost]
def train(use_cuda, train_program, save_path):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
......@@ -172,7 +172,7 @@ def train(use_cuda, train_program, save_path):
print("avg_cost: %s" % avg_cost)
if float(avg_cost) < 100.0: # Large value to increase CI speed
trainer.save_params(save_path)
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
float(avg_cost)))
......@@ -183,7 +183,7 @@ def train(use_cuda, train_program, save_path):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_path)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -197,10 +197,10 @@ def train(use_cuda, train_program, save_path):
feed_order=feed_order)
def infer(use_cuda, inference_program, save_path):
def infer(use_cuda, inference_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
inference_program, param_path=save_path, place=place)
inference_program, param_path=params_dirname, place=place)
# Setup inputs by creating LoDTensors to represent sequences of words.
# Here each word is the basic element of these LoDTensors and the shape of
......@@ -251,9 +251,9 @@ def infer(use_cuda, inference_program, save_path):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "label_semantic_roles.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "label_semantic_roles.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -57,7 +57,7 @@ def train_program():
return [avg_cost, acc]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
......@@ -78,7 +78,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
event.epoch + 1, avg_cost, acc))
......@@ -100,11 +100,11 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -116,17 +116,17 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
save_dirname = "recognize_digits_conv.inference.model"
params_dirname = "recognize_digits_conv.inference.model"
# call train() with is_local argument to run distributed train
train(
use_cuda=use_cuda,
train_program=train_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
if __name__ == '__main__':
......
......@@ -44,7 +44,7 @@ def train_program():
return [avg_cost, acc]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
......@@ -62,7 +62,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
event.epoch + 1, avg_cost, acc))
......@@ -81,11 +81,11 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -97,17 +97,17 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
save_dirname = "recognize_digits_mlp.inference.model"
params_dirname = "recognize_digits_mlp.inference.model"
# call train() with is_local argument to run distributed train
train(
use_cuda=use_cuda,
train_program=train_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
if __name__ == '__main__':
......
......@@ -155,7 +155,7 @@ def train_program():
return [avg_cost, scale_infer]
def train(use_cuda, train_program, save_path):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.SGD(learning_rate=0.2)
......@@ -180,7 +180,7 @@ def train(use_cuda, train_program, save_path):
print("avg_cost: %s" % avg_cost)
if float(avg_cost) < 4: # Smaller value to increase CI speed
trainer.save_params(save_path)
trainer.save_params(params_dirname)
trainer.stop()
else:
print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
......@@ -197,43 +197,30 @@ def train(use_cuda, train_program, save_path):
num_epochs=1,
event_handler=event_handler,
reader=train_reader,
feed_order=[
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id',
'category_id', 'movie_title', 'score'
])
feed_order=feed_order)
def infer(use_cuda, inference_program, save_path):
def infer(use_cuda, inference_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
inference_program, param_path=save_path, place=place)
def create_lod_tensor(data, lod=None):
tensor = fluid.LoDTensor()
if lod is None:
# Tensor, the shape is [batch_size, 1]
index = 0
lod_0 = [index]
for l in range(len(data)):
index += 1
lod_0.append(index)
lod = [lod_0]
tensor.set_lod(lod)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
tensor.set(flattened_data, place)
return tensor
# Generate a random input for inference
user_id = create_lod_tensor([[1]])
gender_id = create_lod_tensor([[1]])
age_id = create_lod_tensor([[0]])
job_id = create_lod_tensor([[10]])
movie_id = create_lod_tensor([[783]])
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
[[0, 5]])
inference_program, param_path=params_dirname, place=place)
# Use the first data from paddle.dataset.movielens.test() as input.
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor,
# where `data` is a list of sequences of index numbers, `lod` is
# the level of detail (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
# indicating that `data` consists of two sequences of length 3 and 2.
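# As a quick illustrative check of that convention (mirrored by the
# LoDTensor unit test later in this diff):
#   fluid.create_lod_tensor([[1, 2, 3], [4, 5]], [[3, 2]], place)
# yields a tensor whose tensor.lod() is the offset form [[0, 3, 5]].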
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
age_id = fluid.create_lod_tensor([[0]], [[1]], place)
job_id = fluid.create_lod_tensor([[10]], [[1]], place)
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
place)
results = inferencer.infer(
{
......@@ -253,12 +240,15 @@ def infer(use_cuda, inference_program, save_path):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "recommender_system.inference.model"
train(use_cuda=use_cuda, train_program=train_program, save_path=save_path)
params_dirname = "recommender_system.inference.model"
train(
use_cuda=use_cuda,
train_program=train_program,
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
save_path=save_path)
params_dirname=params_dirname)
if __name__ == '__main__':
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# This test is buggy
# py_test(test_understand_sentiment_dynamic_rnn SRCS
# test_understand_sentiment_dynamic_rnn.py SERIAL)
LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn)
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
......
......@@ -64,7 +64,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -85,7 +85,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -97,7 +97,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -112,13 +112,13 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
......@@ -143,9 +143,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -79,7 +79,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -100,7 +100,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -112,7 +112,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -127,13 +127,13 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
......@@ -158,9 +158,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -71,7 +71,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -92,7 +92,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -104,7 +104,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -119,13 +119,13 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
......@@ -150,9 +150,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_stacked_lstm.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_stacked_lstm.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -80,7 +80,7 @@ def train_program(is_sparse):
return avg_cost
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
test_reader = paddle.batch(
......@@ -97,7 +97,7 @@ def train(use_cuda, train_program, save_dirname):
print("loss= ", avg_cost)
if avg_cost < 10.0:
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
if math.isnan(avg_cost):
......@@ -115,10 +115,10 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
# is simply an index to look up for the corresponding word vector and hence
......@@ -153,17 +153,17 @@ def main(use_cuda, is_sparse):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "word2vec.inference.model"
params_dirname = "word2vec.inference.model"
train(
use_cuda=use_cuda,
train_program=partial(train_program, is_sparse),
save_dirname=save_path)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=partial(inference_program, is_sparse),
save_dirname=save_path)
params_dirname=params_dirname)
if __name__ == '__main__':
......
......@@ -173,63 +173,33 @@ def train(use_cuda, save_dirname, is_local=True):
test_reader = paddle.batch(
paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
feeding = {
'user_id': 0,
'gender_id': 1,
'age_id': 2,
'job_id': 3,
'movie_id': 4,
'category_id': 5,
'movie_title': 6,
'score': 7
}
def func_feed(feeding, data):
feed_tensors = {}
for (key, idx) in feeding.iteritems():
tensor = fluid.LoDTensor()
if key != "category_id" and key != "movie_title":
if key == "score":
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"float32")
else:
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"int64")
else:
numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
data)
lod_info = [len(item) for item in numpy_data]
offset = 0
lod = [offset]
for item in lod_info:
offset += item
lod.append(offset)
numpy_data = np.concatenate(numpy_data, axis=0)
tensor.set_lod([lod])
numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
tensor.set(numpy_data, place)
feed_tensors[key] = tensor
return feed_tensors
feed_order = [
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
'movie_title', 'score'
]
def train_loop(main_program):
exe.run(framework.default_startup_program())
feed_list = [
main_program.global_block().var(var_name) for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for batch_id, data in enumerate(train_reader()):
# train a mini-batch
outs = exe.run(program=main_program,
feed=func_feed(feeding, data),
feed=feeder.feed(data),
fetch_list=[avg_cost])
out = np.array(outs[0])
if (batch_id + 1) % 10 == 0:
avg_cost_set = []
for test_data in test_reader():
avg_cost_np = exe.run(
program=test_program,
feed=func_feed(feeding, test_data),
fetch_list=[avg_cost])
avg_cost_np = exe.run(program=test_program,
feed=feeder.feed(test_data),
fetch_list=[avg_cost])
avg_cost_set.append(avg_cost_np[0])
break # test only 1 segment for speeding up CI
......@@ -279,23 +249,6 @@ def infer(use_cuda, save_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
def create_lod_tensor(data, lod=None):
tensor = fluid.LoDTensor()
if lod is None:
# Tensor, the shape is [batch_size, 1]
index = 0
lod_0 = [index]
for l in range(len(data)):
index += 1
lod_0.append(index)
lod = [lod_0]
tensor.set_lod(lod)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
tensor.set(flattened_data, place)
return tensor
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
# Use fluid.io.load_inference_model to obtain the inference program desc,
......@@ -307,26 +260,33 @@ def infer(use_cuda, save_dirname=None):
# Use the first data from paddle.dataset.movielens.test() as input
assert feed_target_names[0] == "user_id"
user_id = create_lod_tensor([[1]])
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor
# where `data` is a list of sequences of index numbers, `lod` is
# the level of detail (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
# indicating that `data` consists of two sequences of length 3 and 2.
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
assert feed_target_names[1] == "gender_id"
gender_id = create_lod_tensor([[1]])
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
assert feed_target_names[2] == "age_id"
age_id = create_lod_tensor([[0]])
age_id = fluid.create_lod_tensor([[0]], [[1]], place)
assert feed_target_names[3] == "job_id"
job_id = create_lod_tensor([[10]])
job_id = fluid.create_lod_tensor([[10]], [[1]], place)
assert feed_target_names[4] == "movie_id"
movie_id = create_lod_tensor([[783]])
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
assert feed_target_names[5] == "category_id"
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
assert feed_target_names[6] == "movie_title"
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
[[0, 5]])
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]],
[[5]], place)
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
......
......@@ -53,11 +53,14 @@ class TestLoDTensor(unittest.TestCase):
self.assertEqual(_convert_lod(lod), converted_lod)
def test_create_lod_tensor(self):
# Only numpy array or a fluid LoDTensor is valid input to
# create_lod_tensor function, currently a list of lists is not.
data = [[1, 2], [3, 4]]
self.assertRaises(Exception, create_lod_tensor, data, [],
# Create LoDTensor from a list
data = [[1, 2, 3], [3, 4]]
wrong_lod = [[2, 2]]
correct_lod = [[3, 2]]
self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod,
fluid.CPUPlace())
tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace())
self.assertEqual(tensor.lod(), [[0, 3, 5]])
# Create LoDTensor from numpy array
data = numpy.random.random([10, 1])
......
......@@ -481,9 +481,9 @@ class OpTest(unittest.TestCase):
def np_dtype_to_fluid_dtype(input):
"""Change the dtype of float16 numpy array
numpy float16 is binded to paddle::platform::float16
numpy float16 is binded to paddle::platform::float16
in tensor_py.h via the help of uint16 data type since
the internal memory representation of float16 is
the internal memory representation of float16 is
uint16_t in paddle and np.uint16 in numpy, which are
themselves binded together by pybind.
......@@ -491,9 +491,9 @@ class OpTest(unittest.TestCase):
input: input numpy array
Returns:
input: The dtype of input will be changed to np.uint16 if
input: The dtype of input will be changed to np.uint16 if
it is originally np.float16, such that the internal memory
of input will be reinterpreted as of dtype np.uint16.
of input will be reinterpreted as of dtype np.uint16.
"""
if input.dtype == np.float16:
input.dtype = np.uint16
......
......@@ -50,5 +50,27 @@ class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
self.check_output()
class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest):
def setUp(self):
self.op_type = "fill_constant_batch_size_like"
self.inputs = {
'Input': (np.random.random((31, 28)).astype("float32"),
[[0, 9, 23, 31]])
}
self.attrs = {
'value': 3.5,
'shape': [-1, 16],
'input_dim_idx': 0,
'output_dim_idx': 0
}
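# With a LoD input, the -1 in 'shape' resolves to the number of
# sequences rather than the first tensor dimension: the lod
# [[0, 9, 23, 31]] above describes 3 sequences, hence the (3, 16)
# expected output below.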
out = np.random.random((3, 16)).astype("float32")
out.fill(3.5)
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
......@@ -369,11 +369,13 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output)
print(str(program))
def test_bilinear_interp(self):
def test_upsampling_bilinear2d(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
output = layers.bilinear_interp(x, 12, 12)
output = layers.upsampling_bilinear2d(x, out_shape=[12, 12])
self.assertIsNotNone(output)
output = layers.upsampling_bilinear2d(x, scale=3)
self.assertIsNotNone(output)
print(str(program))
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
def PolygonBoxRestore(input):
shape = input.shape
batch_size = shape[0]
geo_channels = shape[1]
h = shape[2]
w = shape[3]
h_indexes = np.array(range(h) * w).reshape(
[w, h]).transpose()[np.newaxis, :] # [1, h, w]
w_indexes = np.array(range(w) * h).reshape(
[h, w])[np.newaxis, :] # [1, h, w]
indexes = np.concatenate(
(w_indexes, h_indexes))[np.newaxis, :] # [1, 2, h, w]
indexes = indexes.repeat(
[geo_channels / 2],
axis=0)[np.newaxis, :] # [1, geo_channels/2, 2, h, w]
indexes = indexes.repeat(
[batch_size], axis=0) # [batch_size, geo_channels/2, 2, h, w]
return indexes.reshape(
input.shape) - input # [batch_size, geo_channels, h, w]
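# Illustrative shapes for h = w = 2 (Python 2 list arithmetic):
# range(2) * 2 == [0, 1, 0, 1], so h_indexes[0] == [[0, 0], [1, 1]]
# (row index of each pixel) and w_indexes[0] == [[0, 1], [0, 1]]
# (column index of each pixel); this reference implementation returns
# the stacked coordinate grid minus the input.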
class TestPolygonBoxRestoreOp(OpTest):
def config(self):
self.input_shape = (1, 8, 2, 2)
def setUp(self):
self.config()
self.op_type = "polygon_box_transform"
input = np.random.random(self.input_shape).astype("float32")
self.inputs = {'Input': input}
output = PolygonBoxRestore(input)
self.outputs = {'Output': output}
def test_check_output(self):
self.check_output()
class TestCase1(TestPolygonBoxRestoreOp):
def config(self):
self.input_shape = (2, 10, 3, 2)
class TestCase2(TestPolygonBoxRestoreOp):
def config(self):
self.input_shape = (3, 12, 4, 5)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle.v2.dataset.mnist as mnist
class TestPreprocessor(unittest.TestCase):
def setUp(self):
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=32)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
'./mnist_for_preprocessor_test.recordio', reader, feeder)
def test_main(self):
N = 10
img_expected_res = []
lbl_expected_res = []
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.io.open_recordio_file(
'./mnist_for_preprocessor_test.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
img, lbl = fluid.layers.io.read_file(data_file)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for _ in range(N):
img_v, lbl_v = exe.run(fetch_list=[img, lbl])
img_expected_res.append(img_v / 2)
lbl_expected_res.append(lbl_v + 1)
img_actual_res = []
lbl_actual_res = []
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.io.open_recordio_file(
'./mnist_for_preprocessor_test.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
preprocessor = fluid.layers.io.Preprocessor(reader=data_file)
with preprocessor.block():
img, lbl = preprocessor.inputs()
img_out = img / 2
lbl_out = lbl + 1
preprocessor.outputs(img_out, lbl_out)
data_file = fluid.layers.io.double_buffer(preprocessor())
img, lbl = fluid.layers.io.read_file(data_file)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for _ in range(N):
img_v, lbl_v = exe.run(fetch_list=[img, lbl])
img_actual_res.append(img_v)
lbl_actual_res.append(lbl_v)
for idx in range(N):
# np.allclose only returns a bool; assert on it so a mismatch actually fails.
self.assertTrue(np.allclose(img_expected_res[idx], img_actual_res[idx]))
self.assertTrue(np.allclose(lbl_expected_res[idx], lbl_actual_res[idx]))
......@@ -107,7 +107,7 @@ class ControlFlowGraph(object):
# Repeatedly apply liveness updates until the algorithm stabilizes
# on a complete set of live-input and live-output vars.
while True:
for i in range(self.op_size, 0, -1):
for i in reversed(range(self.op_size)):
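# reversed(range(n)) visits ops n-1 .. 0; the previous
# range(n, 0, -1) visited n .. 1, starting one past the last valid
# index and never updating op 0.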
live_in[i] = set(self._live_in[i])
live_out[i] = set(self._live_out[i])
for s in self._successors[i]:
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DocstringChecker is used to check python doc string's style."""
import six
import astroid
from pylint.checkers import BaseChecker, utils
from pylint.interfaces import IAstroidChecker
from collections import defaultdict
import re
def register(linter):
"""Register checkers."""
linter.register_checker(DocstringChecker(linter))
class Docstring(object):
"""Docstring class holds the parsed doc string elements.
"""
def __init__(self):
self.d = defaultdict(list) #name->[]
self.clear()
def clear(self):
self.d['Args'] = []
self.d['Examples'] = []
self.d['Returns'] = []
self.d['Raises'] = []
self.args = {} #arg_name->arg_type
def get_level(self, string, indent=' '):
level = 0
unit_size = len(indent)
while string[:unit_size] == indent:
string = string[unit_size:]
level += 1
return level
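# e.g. get_level("        Args:") == 2 with the default
# four-space indent unit.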
def parse(self, doc):
"""parse gets sections from doc
Such as Args, Returns, Raises, Examples s
Args:
doc (string): is the astroid node doc string.
Returns:
True if doc is parsed successfully.
"""
self.clear()
lines = doc.splitlines()
state = ("others", -1)
for l in lines:
c = l.strip()
if len(c) <= 0:
continue
level = self.get_level(l)
if c.startswith("Args:"):
state = ("Args", level)
elif c.startswith("Returns:"):
state = ("Returns", level)
elif c.startswith("Raises:"):
state = ("Raises", level)
elif c.startswith("Examples:"):
state = ("Examples", level)
else:
if level > state[1]:
self.d[state[0]].append(c)
continue
state = ("others", -1)
self.d[state[0]].append(c)
self._arg_with_type()
return True
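# Illustrative example: parsing a docstring like
#     Args:
#         x (int): the input.
#     Returns:
#         The result.
# fills d['Args'] with ['x (int): the input.'] and d['Returns'] with
# ['The result.'].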
def get_returns(self):
return self.d['Returns']
def get_raises(self):
return self.d['Raises']
def get_examples(self):
return self.d['Examples']
def _arg_with_type(self):
for t in self.d['Args']:
m = re.search('([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t)
if m:
self.args[m.group(1)] = m.group(2)
return self.args
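# e.g. the Args line "scale (int): scale is the number." yields
# self.args['scale'] == '(int)'.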
class DocstringChecker(BaseChecker):
"""DosstringChecker is pylint checker to
check docstring style.
"""
__implements__ = (IAstroidChecker, )
POSITIONAL_MESSAGE_ID = 'str-used-on-positional-format-argument'
KEYWORD_MESSAGE_ID = 'str-used-on-keyword-format-argument'
name = 'doc-string-checker'
symbol = "doc-string"
priority = -1
msgs = {
'W9001': ('One line doc string on > 1 lines', symbol + "-one-line",
'Used when a short doc string is on multiple lines'),
'W9002':
('Doc string does not end with "." period', symbol + "-end-with",
'Used when a doc string does not end with a period'),
'W9003': ('All args with their types must be mentioned in doc string',
symbol + "-with-all-args",
'Used when not all arguments are in the doc string '),
'W9005': ('Missing docstring or docstring is too short',
symbol + "-missing", 'Add docstring longer >=10'),
'W9006': ('Docstring indent error, use 4 space for indent',
symbol + "-indent-error", 'Use 4 space for indent'),
'W9007': ('You should add `Returns` in comments',
symbol + "-with-returns",
'There should be a `Returns` section in comments'),
'W9008': ('You should add `Raises` section in comments',
symbol + "-with-raises",
'There should be a `Raises` section in comments'),
}
options = ()
def visit_functiondef(self, node):
"""visit_functiondef checks Function node docstring style.
Args:
node (astroid.node): The visiting node.
Returns:
True if successful, otherwise False.
"""
self.check_doc_string(node)
if node.tolineno - node.fromlineno <= 10:
return True
if not node.doc:
return True
doc = Docstring()
doc.parse(node.doc)
self.all_args_in_doc(node, doc)
self.with_returns(node, doc)
self.with_raises(node, doc)
def visit_module(self, node):
self.check_doc_string(node)
def visit_classdef(self, node):
self.check_doc_string(node)
def check_doc_string(self, node):
self.missing_doc_string(node)
self.one_line(node)
self.has_period(node)
self.indent_style(node)
def missing_doc_string(self, node):
if node.tolineno - node.fromlineno <= 10:
return True
if node.doc is None or len(node.doc) < 10:
self.add_message('W9005', node=node, line=node.fromlineno)
return False
# FIXME(gongwb): give the docstring line-no
def indent_style(self, node, indent=4):
"""indent_style checks docstring's indent style
Args:
node (astroid.node): The visiting node.
indent (int): The default indent of style
Returns:
True if successful other wise False.
"""
if node.doc is None:
return True
doc = node.doc
lines = doc.splitlines()
for l in lines:
cur_indent = len(l) - len(l.lstrip())
if cur_indent % indent != 0:
self.add_message('W9006', node=node, line=node.fromlineno)
return False
return True
def one_line(self, node):
"""one_line checks if docstring (len < 40) is on one line.
Args:
node (astroid.node): The node being visited.
Returns:
True if successful, otherwise False.
"""
doc = node.doc
if doc is None:
return True
if len(doc) > 40:
return True
elif sum(doc.find(nl) for nl in ('\n', '\r', '\n\r')) == -3:
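# str.find returns -1 when the substring is absent, so a sum of -3
# means none of the three newline variants occurs in the docstring.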
return True
else:
self.add_message('W9001', node=node, line=node.fromlineno)
return False
return True
def has_period(self, node):
"""has_period checks if one line doc end-with '.' .
Args:
node (astroid.node): the node is visiting.
Returns:
True if successful otherwise False.
"""
if node.doc is None:
return True
if len(node.doc.splitlines()) > 1:
return True
if not node.doc.strip().endswith('.'):
self.add_message('W9002', node=node, line=node.fromlineno)
return False
return True
def with_raises(self, node, doc):
"""with_raises checks if one line doc end-with '.' .
Args:
node (astroid.node): the node is visiting.
doc (Docstring): Docstring object.
Returns:
True if successful otherwise False.
"""
find = False
for t in node.body:
if not isinstance(t, astroid.Raise):
continue
find = True
break
if not find:
return True
if len(doc.get_raises()) == 0:
self.add_message('W9008', node=node, line=node.fromlineno)
return False
return True
def with_returns(self, node, doc):
"""with_returns checks if docstring comments what are returned .
Args:
node (astroid.node): the node is visiting.
doc (Docstring): Docstring object.
Returns:
True if successful otherwise False.
"""
find = False
for t in node.body:
if not isinstance(t, astroid.Return):
continue
find = True
break
if not find:
return True
if len(doc.get_returns()) == 0:
self.add_message('W9007', node=node, line=node.fromlineno)
return False
return True
def all_args_in_doc(self, node, doc):
"""all_args_in_doc checks if arguments are mentioned in doc
Args:
node (astroid.node): the node is visiting.
doc (Docstring): Docstring object
Returns:
True if successful otherwise False.
"""
args = []
for arg in node.args.get_children():
if (not isinstance(arg, astroid.AssignName)) \
or arg.name == "self":
continue
args.append(arg.name)
if len(args) <= 0:
return True
parsed_args = doc.args
if len(args) > 0 and len(parsed_args) <= 0:
print "debug:parsed args: ", parsed_args
self.add_message('W9003', node=node, line=node.fromlineno)
return False
for t in args:
if t not in parsed_args:
print t, " with (type) not in ", parsed_args
self.add_message('W9003', node=node, line=node.fromlineno)
return False
return True
#!/bin/bash
TOTAL_ERRORS=0
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
export PYTHONPATH=$DIR:$PYTHONPATH
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
pylint --disable=all --load-plugins=docstring_checker \
--enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done
#exit $TOTAL_ERRORS
#For now, just warning:
exit 0
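# Manual invocation sketch (assumes docstring_checker.py sits next to
# this hook, as the PYTHONPATH export above implies):
#   PYTHONPATH=$DIR pylint --disable=all --load-plugins=docstring_checker \
#     --enable=doc-string-missing some_file.py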
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import docstring_checker
import pylint.testutils
import astroid
import pytest
import sys
class TestDocstring(pylint.testutils.CheckerTestCase):
CHECKER_CLASS = docstring_checker.DocstringChecker
def test_one_line(self):
func_node = astroid.extract_node('''
def test():
"""get
news.
"""
if True:
return 5
return 5
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9001' == got[0][0]
def test_end_with_period(self):
func_node = astroid.extract_node('''
def test():
"""get news"""
if True:
return 5
return 5
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9002' == got[0][0]
def test_args(self):
func_node = astroid.extract_node('''
def test(scale, mean):
"""get news.
Args:
scale (int): scale is the number.
"""
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9003' == got[0][0]
def test_missing(self):
func_node = astroid.extract_node('''
def test():
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9005' == got[0][0]
def test_indent(self):
func_node = astroid.extract_node('''
def test():
""" get get get get get get get get
get get get get get get get get.
"""
pass
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9006' == got[0][0]
def test_with_returns(self):
func_node = astroid.extract_node('''
def test():
"""get news.
Args:
scale (int): scale is the number.
"""
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
return mean
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9007' == got[0][0]
def test_with_raises(self):
func_node = astroid.extract_node('''
def test():
"""get news.
Args:
scale (int): scale is the number.
"""
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
mean=scale
raise ValueError('A very specific bad thing happened.')
''')
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 1
assert 'W9008' == got[0][0]
def test_no_message(self):
p = '''
def fc(input,
size,
num_flatten_dims=1,
param_attr=None,
bias_attr=None,
act=None,
name=None):
"""
**Fully Connected Layer**
The fully connected layer can take multiple tensors as its inputs. It
creates a variable called weights for each input tensor, which represents
a fully connected weight matrix from each input unit to each output unit.
The fully connected layer multiplies each input tensor with its corresponding
weight to produce an output Tensor. If multiple input tensors are given,
the results of multiple multiplications will be summed up. If bias_attr is
not None, a bias variable will be created and added to the output. Finally,
if activation is not None, it will be applied to the output as well.
This process can be formulated as follows:
Args:
input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
the input tensor(s) is at least 2.
size(int): The number of output units in this layer.
num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
two dimensions. If this happens, the multidimensional tensor will first be flattened
into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
dimensions will be flattened to form the first dimension of the final matrix (height of
the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
form the second dimension of the final matrix (width of the matrix). For example, suppose
`X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
parameters/weights of this layer.
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
of this layer. If it is set to None, no bias will be added to the output units.
act (str, default None): Activation to be applied to the output of this layer.
name (str, default None): The name of this layer.
Returns:
A tensor variable storing the transformation result.
Raises:
ValueError: If rank of the input tensor is less than 2.
Examples:
.. code-block:: python
data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
fc = fluid.layers.fc(input=data, size=1000, act="tanh")
"""
raise ValueError('A very specific bad thing happened.')
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
size = 1
return size
'''
func_node = astroid.extract_node(p)
self.checker.visit_functiondef(func_node)
got = self.linter.release_messages()
assert len(got) == 0
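# Illustrative local run (assumes this file is saved as
# test_docstring_checker.py with docstring_checker.py importable):
#   python -m pytest test_docstring_checker.py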