Commit afbc4ce4 authored by: T tensor-tang

Merge remote-tracking branch 'ups/develop' into mklml_funcs

......@@ -57,7 +57,10 @@ option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -202,7 +205,7 @@ endif(USE_NNPACK)
add_subdirectory(proto)
if(NOT MOBILE_INFERENCE)
if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
# "add_subdirectory(go)" should be placed after the following line,
# because it depends on paddle/optimizer.
add_subdirectory(paddle/optimizer)
......@@ -230,3 +233,7 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
if (WITH_CONTRIB)
add_subdirectory(paddle/contrib)
endif()
......@@ -101,6 +101,3 @@ RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
# the development image runs the build work by default
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
......@@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp
CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
......@@ -24,22 +24,22 @@ Currently supported `--model` argument include:
* Run the following command to start a benchmark job locally:
```bash
python fluid_benchmark.py --model mnist --parallel 1 --device GPU --with_test
python fluid_benchmark.py --model mnist --device GPU
```
You can choose to use GPU/CPU training. With GPU training, you can specify
`--parallel 1` to run multi GPU training.
`--gpus <gpu_num>` to run multi GPU training.
* Run distributed training with parameter servers:
* start parameter servers:
```bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* start trainers:
```bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* Run distributed training using NCCL2
```bash
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method nccl2
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```
## Run Distributed Benchmark on Kubernetes Cluster
......@@ -48,7 +48,7 @@ We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submi
distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver --with_test" --disttype pserver
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
```
Then the yaml files are generated under directory `myjob`, you can run:
......
......@@ -41,6 +41,10 @@ if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
if(EIGEN_USE_THREADS)
add_definitions(-DEIGEN_USE_THREADS)
endif(EIGEN_USE_THREADS)
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
......
......@@ -212,6 +212,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS}
-Dprotobuf_BUILD_TESTS=OFF
-DCMAKE_SKIP_RPATH=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
......
......@@ -1003,9 +1003,9 @@ dice_loss
.. autofunction:: paddle.fluid.layers.dice_loss
:noindex:
bilinear_interp
upsampling_bilinear2d
____
.. autofunction:: paddle.fluid.layers.bilinear_interp
.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
:noindex:
......@@ -35,13 +35,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
# 2. Optional: build the Docker image used to compile PaddlePaddle from the source tree
docker build -t paddle:dev .
# 3. Run the following command to build CPU-only binaries
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or use the image built in the optional step above (step 2 must be run first)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
Note: the above commands map the current directory (the root of the source tree) to the :code:`/paddle`
directory inside the container. If you use the image you built yourself (step 4 above), it runs the default
entry point :code:`build.sh` described in the :code:`Dockerfile`, so you can omit the final script command in step 3.
Note: the above commands map the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container.
After the build finishes, the output whl package is generated under build/python/dist; you can install it on the current machine or copy it to the target machine to install:
......@@ -72,15 +70,15 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
If you want to run a single unit test (for example :code:`test_sum_op` ):
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh
cd /paddle/build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
./paddle/scripts/paddle_build.sh build
cd build
ctest -R test_sum_op -V
.. _faq_docker:
......
......@@ -34,14 +34,12 @@ Or you can build your own image from source as the optional step below:
# 2. Optional: build development docker image from source
docker build -t paddle:dev .
# 3. Run the following command to build a CPU-Only binaries
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
NOTE: The above command tries to mount the current working directory (the root directory of the source code)
into the :code:`/paddle` directory inside the docker container. If you are using your own image
(Step 4), it will run the default entry point :code:`build.sh`, so you can omit the last
command in step 3.
into the :code:`/paddle` directory inside the docker container.
When the compilation finishes, you can get the output whl package under
build/python/dist, then you can choose to install the whl on the local
......@@ -74,15 +72,15 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
If you wish to run only one unit test, like :code:`test_sum_op`:
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh
cd /paddle/build
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
./paddle/scripts/paddle_build.sh build
cd build
ctest -R test_sum_op -V
.. _faq_docker:
......
......@@ -98,7 +98,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
Users in China can use the following mirror to speed up access:
.. code-block: bash
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
......
......@@ -105,7 +105,7 @@ We provide a packaged book image, simply issue the command:
For users in China, we provide a faster mirror:
.. code-block: bash
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
......
......@@ -11,7 +11,6 @@ GTAGS
*.pb.cc
*.pb.h
*_pb2.py
paddle_*
output/
google/
Makefile
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
add_subdirectory(inference)
......@@ -89,7 +89,7 @@ cd Paddle
# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
nvidia-docker build -t paddle:float16 .
# After running this, different results will be written to different log files in Paddle/contrib/float16/
nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/paddle/contrib/float16/run_float16_demo.sh
```
#### Accuracy
......
......@@ -3,7 +3,7 @@
BUILD_PATH=/paddle/fp16_build
WHEEL_PATH=$BUILD_PATH/python/dist
INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
DEMO_PATH=/paddle/contrib/float16
DEMO_PATH=/paddle/paddle/contrib/float16
# Use the single most powerful CUDA GPU on your machine
export CUDA_VISIBLE_DEVICES=0
......@@ -50,7 +50,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
--repeat=$REPEAT \
......@@ -68,7 +67,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_resnet \
--data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
--repeat=$REPEAT \
......@@ -86,7 +84,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
--repeat=$REPEAT \
......@@ -104,7 +101,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
--repeat=$REPEAT \
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
function(inference_api_test TARGET_NAME TEST_SRC DEP_TEST)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS})
list(APPEND arg_list "_${arg}")
endforeach()
else()
list(APPEND arg_list "_")
endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(${TARGET_NAME}
SRCS ${TEST_SRC}
DEPS paddle_fluid_api paddle_inference_api paddle_inference_api_impl
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
# set_tests_properties(${TARGET_NAME}
# PROPERTIES DEPENDS ${DEP_TEST})
endforeach()
endfunction(inference_api_test)
cc_library(paddle_inference_api
SRCS paddle_inference_api.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_library(paddle_inference_api_impl
SRCS paddle_inference_api_impl.cc
DEPS paddle_inference_api paddle_fluid_api)
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api)
inference_api_test(test_paddle_inference_api_impl
test_paddle_inference_api_impl.cc
test_word2vec)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
......@@ -12,49 +12,74 @@
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the definition of a simple Inference API for Paddle.
*
* ATTENTION: It requires some C++ features; for lower C++ versions or for C, we
* might release another API.
*/
#pragma once
#include <memory>
#include <string>
#include <vector>
namespace paddle {
class Predictor {
public:
struct Attr;
Predictor() = default;
enum PaddleDType {
FLOAT32,
INT64,
};
// Build the network before inference.
bool Init(const Attr& attr);
struct PaddleBuf {
void* data; // pointer to the data memory.
size_t length; // number of memory bytes.
};
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
/*
* A simple Inference API for Paddle. Currently this API is only intended for
* non-sequence scenarios.
* TODO(Superjomn) Prepare another API for NLP-related usages.
*/
class PaddlePredictor {
public:
struct Config;
PaddlePredictor() = default;
PaddlePredictor(const PaddlePredictor&) = delete;
// Predict a record.
// Arguments:
// inputs: the name of the input variables.
// outputs: the names of the output variables.
// input_shapes: the shape of the input variables.
// output_shapes: the shape of the output variables.
// input_data: the data of the input variables.
// output_data: the data of the output variables.
bool Run(const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::vector<std::vector<int>>& input_shapes,
const std::vector<std::vector<int>>& output_shapes,
const std::vector<std::vector<float>>& input_data,
std::vector<std::vector<float>>* output_data);
// Clone a predictor that share the model weights.
Predictor* Clone();
// The caller is responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be alive until Run returns. The caller is
// responsible for releasing the memory of `output_data`.
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) = 0;
// Clone a predictor that shares the model weights; the cloned predictor should
// be thread-safe.
virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
// Destroy the Predictor.
~Predictor();
virtual ~PaddlePredictor() {}
struct Attr {
friend std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
const PaddlePredictor::Config& config);
// The common configs for all the predictors.
struct Config {
enum class EngineKind;
std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on
// third-party engines.
EngineKind engine_kind{Attr::EngineKind::kNone};
// third-party engines.
EngineKind engine_kind{Config::EngineKind::kNone};
enum class EngineKind {
kNone = -1, // Use the native Fluid facility.
......@@ -66,4 +91,8 @@ public:
};
};
// A factory to help create different predictors.
template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
} // namespace paddle
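A minimal usage sketch of the API above, assuming a hypothetical `MyConfig` subclass, a placeholder model path and feed name, and that a `CreatePaddlePredictor` specialization for that config type is linked in (as the implementation below provides for its own config):

```cpp
#include <cstdlib>
#include <memory>
#include <vector>

#include "paddle/contrib/inference/paddle_inference_api.h"

namespace {

// Hypothetical concrete config; a real caller uses a Config subclass for
// which CreatePaddlePredictor<ConfigT> has been specialized.
struct MyConfig : public paddle::PaddlePredictor::Config {};

void RunOnce() {
  MyConfig config;
  config.model_dir = "/path/to/some.inference.model";  // hypothetical path

  std::unique_ptr<paddle::PaddlePredictor> predictor =
      paddle::CreatePaddlePredictor(config);

  // The caller owns the input buffer and keeps it alive until Run returns.
  std::vector<float> buf = {1.f, 2.f, 3.f, 4.f};
  paddle::PaddleTensor input;
  input.name = "x";  // hypothetical feed variable name
  input.shape = {1, 4};
  input.data.data = buf.data();
  input.data.length = buf.size() * sizeof(float);
  input.dtype = paddle::PaddleDType::FLOAT32;

  // Run() fills output_data; the caller releases each output buffer
  // (the implementation below allocates them with malloc).
  std::vector<paddle::PaddleTensor> outputs;
  if (predictor->Run({input}, &outputs)) {
    for (auto& out : outputs) {
      std::free(out.data.data);
    }
  }
}

}  // namespace
```

The `DemoPredictor` test further down in this diff follows the same pattern with its own config type.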
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sys/time.h>
#include <algorithm>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
namespace paddle {
namespace {
// A simple timer that reports elapsed wall time in milliseconds.
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
template <class T>
std::string num2str(T a) {
std::stringstream istr;
istr << a;
return istr.str();
}
} // namespace
bool PaddlePredictorImpl::Init() {
VLOG(3) << "Predictor::init()";
// TODO(panyx0718): Should CPU vs GPU device be decided by id?
if (config_.device >= 0) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
paddle::framework::InitDevices(false);
executor_.reset(new paddle::framework::Executor(place_));
scope_.reset(new paddle::framework::Scope());
// Initialize the inference program
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files located in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with those used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << "fail to load inference model.";
return false;
}
ctx_ = executor_->Prepare(*inference_program_, 0);
// Create variables
// TODO(panyx0718): Why need to test share_variables here?
if (config_.share_variables) {
executor_->CreateVariables(*inference_program_, scope_.get(), 0);
}
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true;
}
bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
VLOG(3) << "Predictor::predict";
Timer timer;
timer.tic();
// set feed variable
std::map<std::string, const paddle::framework::LoDTensor *> feed_targets;
std::vector<paddle::framework::LoDTensor> feeds;
if (!SetFeed(inputs, &feeds)) {
LOG(ERROR) << "fail to set feed";
return false;
}
for (size_t i = 0; i < feed_target_names_.size(); ++i) {
feed_targets[feed_target_names_[i]] = &feeds[i];
}
// get fetch variable
std::map<std::string, paddle::framework::LoDTensor *> fetch_targets;
std::vector<paddle::framework::LoDTensor> fetchs;
fetchs.resize(fetch_target_names_.size());
for (size_t i = 0; i < fetch_target_names_.size(); ++i) {
fetch_targets[fetch_target_names_[i]] = &fetchs[i];
}
// Run the inference program
// if variables are shared, we do not need to create them again
executor_->RunPreparedContext(ctx_.get(),
scope_.get(),
&feed_targets,
&fetch_targets,
!config_.share_variables);
if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs";
return false;
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
return true;
}
std::unique_ptr<PaddlePredictor> PaddlePredictorImpl::Clone() {
VLOG(3) << "Predictor::clone";
std::unique_ptr<PaddlePredictorImpl> cls(new PaddlePredictorImpl(config_));
if (!cls->InitShared(this)) {
LOG(ERROR) << "fail to call InitShared";
return nullptr;
}
return cls;
}
// TODO(panyx0718): Consider merge with Init()?
bool PaddlePredictorImpl::InitShared(PaddlePredictorImpl *cls) {
VLOG(3) << "Predictor::init_shared";
// 1. Define place, executor, scope
if (this->config_.device >= 0) {
place_ = paddle::platform::CUDAPlace();
} else {
place_ = paddle::platform::CPUPlace();
}
this->executor_.reset(new paddle::framework::Executor(this->place_));
this->scope_.reset(new paddle::framework::Scope());
// Initialize the inference program
if (!this->config_.model_dir.empty()) {
// Parameters are saved in separate files located in
// the specified `dirname`.
this->inference_program_ = paddle::inference::Load(
this->executor_.get(), this->scope_.get(), this->config_.model_dir);
} else if (!this->config_.prog_file.empty() &&
!this->config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with those used
// in Python API `fluid.io.save_inference_model`.
this->inference_program_ =
paddle::inference::Load(this->executor_.get(),
this->scope_.get(),
this->config_.prog_file,
this->config_.param_file);
}
this->ctx_ = this->executor_->Prepare(*this->inference_program_, 0);
// 3. create variables
// TODO(panyx0718): why test share_variables.
if (config_.share_variables) {
this->executor_->CreateVariables(
*this->inference_program_, this->scope_.get(), 0);
}
// 4. Get the feed_target_names and fetch_target_names
this->feed_target_names_ = this->inference_program_->GetFeedTargetNames();
this->fetch_target_names_ = this->inference_program_->GetFetchTargetNames();
return true;
}
bool PaddlePredictorImpl::SetFeed(
const std::vector<PaddleTensor> &inputs,
std::vector<paddle::framework::LoDTensor> *feeds) {
VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feed_target_names_.size()) {
LOG(ERROR) << "wrong feed input size.";
return false;
}
for (size_t i = 0; i < feed_target_names_.size(); ++i) {
paddle::framework::LoDTensor input;
paddle::framework::DDim ddim =
paddle::framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr =
input.mutable_data<int64_t>(ddim, paddle::platform::CPUPlace());
} else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, paddle::platform::CPUPlace());
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr),
inputs[i].data.data,
inputs[i].data.length);
feeds->push_back(input);
LOG(ERROR) << "Actual feed type " << feeds->back().type().name();
}
return true;
}
bool PaddlePredictorImpl::GetFetch(
const std::vector<paddle::framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *outputs) {
VLOG(3) << "Predictor::get_fetch";
outputs->resize(fetchs.size());
for (size_t i = 0; i < fetchs.size(); ++i) {
// TODO(panyx0718): Support fetch of other types.
if (fetchs[i].type() != typeid(float)) {
LOG(ERROR) << "only support fetching float now.";
return false;
}
std::vector<int> shape;
auto dims_i = fetchs[i].dims();
auto lod = fetchs[i].lod();
const float *output_ptr = fetchs[i].data<float>();
// const int64_t* output_ptr = fetchs[i].data<int64_t>();
auto num = fetchs[i].numel();
std::vector<float> data;
if (0 == lod.size()) {
std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
for (int j = 0; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
} else {
// for batch detection
// image[0] -> output[0] shape {145, 6}
// image[1] -> output[1] shape {176, 6}
// then,
// the batch output shape {321, 6}
// the lod {{0, 145, 321}}
// so we should pad output[0] to {176, 6}
size_t max_dim = 0;
for (size_t j = 1; j < lod[0].size(); j++) {
max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
}
size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
if (max_dim > 0) {
data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
}
for (size_t j = 1; j < lod[0].size(); j++) {
size_t start = lod[0][j - 1] * common_dim;
size_t end = lod[0][j] * common_dim;
if (end > start) {
std::copy(output_ptr + start,
output_ptr + end,
data.begin() + (j - 1) * max_dim * common_dim);
}
}
shape.push_back(lod[0].size() - 1);
shape.push_back(max_dim);
for (int j = 1; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
}
outputs->at(i).shape = shape;
outputs->at(i).data.length = sizeof(float) * data.size();
outputs->at(i).data.data = malloc(outputs->at(i).data.length);
std::memcpy(
outputs->at(i).data.data, data.data(), outputs->at(i).data.length);
outputs->at(i).dtype = PaddleDType::FLOAT32;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy.
}
return true;
}
std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl(
const VisConfig &config) {
VLOG(3) << "create PaddlePredictorImpl";
// 1. GPU memory
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f &&
config.fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
num2str<float>(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
}
std::unique_ptr<PaddlePredictorImpl> predictor(
new PaddlePredictorImpl(config));
if (!predictor->Init()) {
return nullptr;
}
return predictor;
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <memory>
#include <string>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
struct VisConfig : public PaddlePredictor::Config {
int device;
float fraction_of_gpu_memory;
std::string prog_file;
std::string param_file;
bool share_variables;
};
/*
* Do not use this, just a demo indicating how to customize a Predictor.
*/
class PaddlePredictorImpl : public PaddlePredictor {
public:
explicit PaddlePredictorImpl(const VisConfig &config) : config_(config) {}
bool Init();
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override;
std::unique_ptr<PaddlePredictor> Clone() override;
~PaddlePredictorImpl() override {}
private:
bool InitShared(PaddlePredictorImpl *cls);
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
std::vector<paddle::framework::LoDTensor> *feeds);
bool GetFetch(const std::vector<paddle::framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *output_data);
VisConfig config_;
paddle::platform::Place place_;
std::unique_ptr<paddle::framework::Executor> executor_;
std::unique_ptr<paddle::framework::Scope> scope_;
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx_;
std::unique_ptr<paddle::framework::ProgramDesc> inference_program_;
std::vector<std::string> feed_target_names_;
std::vector<std::string> fetch_target_names_;
};
std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl(
const VisConfig &config);
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
namespace paddle {
/*
* Do not use this, just a demo indicating how to customize a config for a
* specific predictor.
*/
struct DemoConfig : public PaddlePredictor::Config {
float other_config;
};
/*
* Do not use this, just a demo indicating how to customize a Predictor.
*/
class DemoPredictor : public PaddlePredictor {
public:
explicit DemoPredictor(const DemoConfig &config) {
LOG(INFO) << "I get other_config " << config.other_config;
}
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override {
LOG(INFO) << "Run";
return false;
}
std::unique_ptr<PaddlePredictor> Clone() override { return nullptr; }
~DemoPredictor() override {}
};
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<DemoConfig>(
const DemoConfig &config) {
std::unique_ptr<PaddlePredictor> x(new DemoPredictor(config));
return x;
}
TEST(paddle_inference_api, demo) {
DemoConfig config;
config.other_config = 1.7;
auto predictor = CreatePaddlePredictor(config);
std::vector<PaddleTensor> outputs;
predictor->Run({}, &outputs);
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
namespace paddle {
PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
PaddleTensor pt;
pt.data.data = t->data<void>();
if (t->type() == typeid(int64_t)) {
pt.data.length = t->numel() * sizeof(int64_t);
pt.dtype = PaddleDType::INT64;
} else if (t->type() == typeid(float)) {
pt.data.length = t->numel() * sizeof(float);
pt.dtype = PaddleDType::FLOAT32;
} else {
LOG(FATAL) << "unsupported type.";
}
pt.shape = framework::vectorize2int(t->dims());
return pt;
}
TEST(paddle_inference_api_impl, word2vec) {
VisConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.85;
config.device = 0;
config.share_variables = true;
std::unique_ptr<PaddlePredictorImpl> predictor =
CreatePaddlePredictorImpl(config);
framework::LoDTensor first_word, second_word, third_word, fourth_word;
framework::LoD lod{{0, 1}};
int64_t dict_size = 2073; // The size of dictionary
SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
std::vector<PaddleTensor> cpu_feeds;
cpu_feeds.push_back(LodTensorToPaddleTensor(&first_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&second_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&third_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&fourth_word));
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(cpu_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1);
for (size_t i = 0; i < outputs.size(); ++i) {
size_t len = outputs[i].data.length;
float* data = static_cast<float*>(outputs[i].data.data);
for (int j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0);
}
free(outputs[i].data.data);
}
}
} // namespace paddle
......@@ -243,13 +243,8 @@ const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
}
void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
for (auto &output : outputs_) {
std::replace(output.second.begin(), output.second.end(), old_name,
new_name);
}
RenameInput(old_name, new_name);
RenameOutput(old_name, new_name);
need_update_ = true;
}
......@@ -274,6 +269,13 @@ void OpDesc::RenameInput(const std::string &old_name,
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
if (it != attrs_.end()) {
auto &op_vars = boost::get<std::vector<std::string>>(it->second);
std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
}
need_update_ = true;
}
......
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api
SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
......
......@@ -149,12 +149,14 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
}
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
// GPU data is copied to CPU buffer when sending,
// free the buffer when possible.
destroy_callback = [](void* backing) {
platform::CUDAPinnedPlace cuda_pinned;
memory::Free(cuda_pinned, backing);
};
#endif
}
std::string header;
......
......@@ -24,6 +24,8 @@ detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
detection_library(target_assign_op SRCS target_assign_op.cc
target_assign_op.cu)
detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu)
# Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto* in = ctx.Input<Tensor>("Input");
auto in_dims = in->dims();
const T* in_data = in->data<T>();
auto* out = ctx.Output<Tensor>("Output");
T* out_data = out->mutable_data<T>(ctx.GetPlace());
int batch_size = in_dims[0];
int geo_channel = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int id = 0;
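// Even geometry channels hold x (width) offsets and odd channels hold y (height)
// offsets, so the real coordinate is the pixel index minus the stored offset.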
for (int id_n = 0; id_n < batch_size * geo_channel; ++id_n) {
for (int id_h = 0; id_h < height; ++id_h) {
for (int id_w = 0; id_w < width; ++id_w) {
id = id_n * height * width + width * id_h + id_w;
if (id_n % 2 == 0) {
out_data[id] = id_w - in_data[id];
} else {
out_data[id] = id_h - in_data[id];
}
}
}
}
}
};
class PolygonBoxTransformOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("Input"),
"Input (Input) of polygon_box transform op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Output"),
"Output (Output) of polygon_box transform op should not be null.");
auto in_dim = ctx->GetInputDim("Input");
PADDLE_ENFORCE_EQ(in_dim.size(), 4, "input's rank must be 4.");
PADDLE_ENFORCE_EQ(in_dim[1] % 2, 0,
"input's second dimension must be even.");
ctx->SetOutputDim("Output", in_dim);
}
};
class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"Input",
"The input with shape [batch_size, geometry_channels, height, width]");
AddOutput("Output", "The output with the same shape as input");
AddComment(R"DOC(
PolygonBoxTransform Operator.
The input is the final geometry output of the detection network.
We use 2*n numbers to denote the coordinate shift from n corner vertices of
the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi),
the geometry output contains 2*n channels.
PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(polygon_box_transform, ops::PolygonBoxTransformOp,
ops::PolygonBoxTransformOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
polygon_box_transform,
ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, float>,
ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using platform::PADDLE_CUDA_NUM_THREADS;
#define CUDA_BLOCK_SIZE 16
template <typename T>
__global__ void PolygonBoxTransformKernel(const int n, const int h, const int w,
const T* input, T* output) {
int id_n = threadIdx.x + blockDim.x * blockIdx.x;
int id_h = threadIdx.y + blockDim.y * blockIdx.y;
int id_w = threadIdx.z + blockDim.z * blockIdx.z;
if (id_n < n && id_h < h && id_w < w) {
int id = id_n * h * w + w * id_h + id_w;
if (id_n % 2 == 0) {
output[id] = id_w - input[id];
} else {
output[id] = id_h - input[id];
}
}
}
template <typename T>
class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto* in = ctx.Input<Tensor>("Input");
auto in_dims = in->dims();
const T* in_data = in->data<T>();
auto* out = ctx.Output<Tensor>("Output");
T* out_data = out->mutable_data<T>(ctx.GetPlace());
int batch_size = in_dims[0];
int geo_channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
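// Launch a 3-D grid: x indexes batch * channels, y indexes height, z indexes width.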
dim3 threadsPerBlock(
PADDLE_CUDA_NUM_THREADS / (CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE),
CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE);
dim3 numBlocks((batch_size * geo_channels) / threadsPerBlock.x,
(height + threadsPerBlock.y - 1) / threadsPerBlock.y,
(width + threadsPerBlock.z - 1) / threadsPerBlock.z);
auto stream = ctx.cuda_device_context().stream();
PolygonBoxTransformKernel<T><<<numBlocks, threadsPerBlock, 0, stream>>>(
batch_size * geo_channels, height, width, in_data, out_data);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
polygon_box_transform,
paddle::operators::PolygonBoxTransformOpCUDAKernel<float>,
paddle::operators::PolygonBoxTransformOpCUDAKernel<double>);
......@@ -24,6 +24,14 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Output<framework::Tensor>("Out");
auto* in = ctx.Input<framework::LoDTensor>("Input");
if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
// set the correct batch size for the LoDTensor.
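// (lod().back() stores n + 1 offsets for n sequences, hence the `- 1` below)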
auto odims = out->dims();
int output_dim_idx = ctx.Attr<int>("output_dim_idx");
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->mutable_data<T>(odims, ctx.GetPlace());
}
out->mutable_data<T>(ctx.GetPlace());
auto value = ctx.Attr<float>("value");
......
......@@ -46,7 +46,10 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
const int64_t* label_data = labels->data<int64_t>();
for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i];
int lbl = label_data[i];
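// Guard against out-of-range labels before using them to index prob_data.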
PADDLE_ENFORCE_GE(lbl, 0);
PADDLE_ENFORCE_LT(lbl, class_num);
int index = i * class_num + lbl;
loss_data[i] = -math::TolerableValue<T>()(std::log(prob_data[index]));
}
}
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "unsupported/Eigen/CXX11/Tensor"
#include "paddle/function/EigenThreadDevice.h"
namespace paddle {
......@@ -70,25 +70,26 @@ struct EigenBlasGemm {
dims[0].first = transA ? 0 : 1;
dims[0].second = transB ? 1 : 0;
Eigen::DefaultDevice device;
auto* device = EigenDeviceWarpper::device();
if (N == ldc) {
if (alpha == T(1) && beta == T(0)) {
c.device(device) = a.contract(b, dims);
c.device(*device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.device(device) += a.contract(b, dims);
c.device(*device) += a.contract(b, dims);
} else {
c.device(device) = alpha * a.contract(b, dims) + beta * c;
c.device(*device) = alpha * a.contract(b, dims) + beta * c;
}
} else {
if (alpha == T(1) && beta == T(0)) {
c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
c.slice(offsetC, extentC).device(*device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
c.slice(offsetC, extentC).device(*device) += a.contract(b, dims);
} else {
c.slice(offsetC, extentC).device(device) =
c.slice(offsetC, extentC).device(*device) =
alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
}
}
EigenDeviceWarpper::free_device(device);
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#if defined(__OSX__) || defined(__APPLE__)
#include <sys/sysctl.h>
#include <sys/types.h>
#endif
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
#if defined(__ANDROID__)
int GetCpuCount() {
FILE* fp = fopen("/sys/devices/system/cpu/possible", "r");
if (!fp) {
return 1;
}
int rank0, rank1;
int num = fscanf(fp, "%d-%d", &rank0, &rank1);
fclose(fp);
if (num < 2) return 1;
return rank1 + 1;
}
#elif defined(__OSX__) || defined(__APPLE__)
int GetCpuCount() {
int count = 0;
size_t len = sizeof(int);
sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
return count > 0 ? count : 1;
}
#else
int GetCpuCount() { return 1; }
#endif
class EigenDeviceWarpper {
public: // NOLINT
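// device() hands out an Eigen device for evaluating tensor expressions. With
// EIGEN_USE_THREADS a single static thread-pool device is shared by all callers
// and free_device() is a no-op; otherwise a fresh DefaultDevice is allocated per
// call and released again in free_device().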
#if EIGEN_USE_THREADS
static Eigen::ThreadPoolDevice* device() {
const int num_cpus = GetCpuCount();
const int num_threads = (num_cpus > 2) ? 2 : num_cpus;
static Eigen::ThreadPool tp(num_threads);
static Eigen::ThreadPoolDevice* device =
new Eigen::ThreadPoolDevice(&tp, num_threads);
return device;
}
static void free_device(Eigen::ThreadPoolDevice* device) {
// do nothing
}
#else
static Eigen::DefaultDevice* device() {
Eigen::DefaultDevice* device = new Eigen::DefaultDevice;
return device;
}
static void free_device(Eigen::DefaultDevice* device) { delete device; }
#endif
};
} // namespace paddle
......@@ -7,6 +7,10 @@ set(OPITMIZER_SRCS
sgd_optimizer.cc
)
cc_library(paddle_optimizer STATIC SRCS ${OPITMIZER_SRCS} DEPS paddle_proto glog)
cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
add_library(paddle_optimizer ${OPITMIZER_SRCS})
target_link_libraries(paddle_optimizer paddle_proto glog)
if (WITH_TESTING)
add_unittest(serialization_test serialization_test.cc)
add_unittest(parameter_optimizer_test parameter_optimizer_test.cc)
endif()
#!/bin/bash
function cmake_gen() {
mkdir -p /paddle/build
cd /paddle/build
# build script will not fail if *.deb does not exist
rm *.deb 2>/dev/null || true
# delete previous built whl packages
rm -rf /paddle/paddle/dist 2>/dev/null || true
# Support build for all python versions, currently
# including cp27-cp27m and cp27-cp27mu.
PYTHON_FLAGS=""
if [ "$1" != "" ]; then
echo "using python abi: $1"
if [ "$1" == "cp27-cp27m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
elif [ "$1" == "cp27-cp27mu" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
fi
fi
cat <<EOF
========================================
Configuring cmake in /paddle/build ...
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
${PYTHON_FLAGS}
-DWITH_DSO=ON
-DWITH_DOC=${WITH_DOC:-OFF}
-DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-OFF}
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-DWITH_SWIG_PY=ON
-DWITH_C_API=${WITH_C_API:-OFF}
-DWITH_PYTHON=${WITH_PYTHON:-ON}
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-DCUDNN_ROOT=/usr/
-DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
# docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
${PYTHON_FLAGS} \
-DWITH_DSO=ON \
-DWITH_DOC=${WITH_DOC:-OFF} \
-DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-DWITH_C_API=${WITH_C_API:-OFF} \
-DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \
-DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
}
function run_build() {
cat <<EOF
============================================
Building in /paddle/build ...
============================================
EOF
make clean
make -j `nproc`
}
function run_test() {
if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
cat <<EOF
========================================
Running unit tests ...
========================================
EOF
ctest --output-on-failure
# make install should also be tested when running unit tests
make install -j `nproc`
pip install /usr/local/opt/paddle/share/wheels/*.whl
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version
fi
fi
}
function gen_docs() {
if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
cat <<EOF
========================================
Building documentation ...
In /paddle/build_doc
========================================
EOF
mkdir -p /paddle/build_doc
pushd /paddle/build_doc
cmake .. \
-DWITH_DOC=ON \
-DWITH_GPU=OFF \
-DWITH_AVX=${WITH_AVX:-ON} \
-DWITH_SWIG_PY=ON
make -j `nproc` paddle_docs paddle_apis
popd
fi
if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
cat <<EOF
========================================
Converting C++ source code into HTML ...
========================================
EOF
export WOBOQ_OUT=/paddle/build/woboq_out
mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data
/woboq/generator/codebrowser_generator \
-b /paddle/build \
-a \
-o $WOBOQ_OUT \
-p paddle:/paddle
/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
fi
}
function gen_dockerfile() {
# Set BASE_IMAGE according to env variables
if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04"
else
BASE_IMAGE="ubuntu:16.04"
fi
DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/x86_64-linux-gnu/libcudnn.so"
fi
cat <<EOF
========================================
Generate /paddle/build/Dockerfile ...
========================================
EOF
cat > /paddle/build/Dockerfile <<EOF
FROM ${BASE_IMAGE}
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ENV HOME /root
EOF
if [[ ${WITH_GPU} == "ON" ]]; then
NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&"
else
NCCL_DEPS=""
fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
PADDLE_VERSION="paddle version"
CMD='"paddle", "version"'
else
PADDLE_VERSION="true"
CMD='"true"'
fi
cat >> /paddle/build/Dockerfile <<EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update &&\
${NCCL_DEPS}\
apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \
pip install /*.whl; apt-get install -f -y && \
apt-get clean -y && \
rm -f /*.whl && \
${PADDLE_VERSION} && \
ldconfig
${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_GPU_ENV}
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
cat >> /paddle/build/Dockerfile <<EOF
ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/
EOF
fi
cat >> /paddle/build/Dockerfile <<EOF
# the default command shows the paddle version and exits
CMD [${CMD}]
EOF
}
function gen_capi_package() {
if [[ ${WITH_C_API} == "ON" ]]; then
install_prefix="/paddle/build/capi_output"
rm -rf $install_prefix
make DESTDIR="$install_prefix" install
cd $install_prefix/usr/local
ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz
fi
}
function gen_fluid_inference_lib() {
if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
cat <<EOF
========================================
Deploying fluid inference library ...
========================================
EOF
make -j `nproc` inference_lib_dist
fi
}
set -xe
cmake_gen ${PYTHON_ABI:-""}
run_build
run_test
gen_docs
gen_dockerfile
gen_capi_package
gen_fluid_inference_lib
if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
printf "PaddlePaddle C-API libraries were generated at build/paddle.tgz\n"
else
printf "If you need to install PaddlePaddle in the develop docker image, "
printf "please make install or pip install build/python/dist/*.whl.\n"
fi
#!/bin/bash
set -xe
if [ $ANDROID_ABI == "arm64-v8a" ]; then
ANDROID_ARCH=arm64
if [ $ANDROID_API -lt 21 ]; then
echo "Warning: arm64-v8a requires ANDROID_API >= 21."
ANDROID_API=21
fi
else # armeabi, armeabi-v7a
ANDROID_ARCH=arm
fi
ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
cat <<EOF
============================================
Generating the standalone toolchain ...
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh
--arch=$ANDROID_ARCH
--platform=android-$ANDROID_API
--install-dir=${ANDROID_STANDALONE_TOOLCHAIN}
============================================
EOF
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
--arch=$ANDROID_ARCH \
--platform=android-$ANDROID_API \
--install-dir=$ANDROID_STANDALONE_TOOLCHAIN
BUILD_ROOT=/paddle/build_android
DEST_ROOT=/paddle/install_android
mkdir -p $BUILD_ROOT
cd $BUILD_ROOT
if [ $ANDROID_ABI == "armeabi-v7a" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_NEON=ON \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
elif [ $ANDROID_ABI == "arm64-v8a" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DUSE_EIGEN_FOR_BLAS=OFF \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
elif [ $ANDROID_ABI == "armeabi" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
else
echo "Invalid ANDROID_ABI: $ANDROID_ABI"
fi
cat <<EOF
============================================
Building in $BUILD_ROOT ...
============================================
EOF
make -j `nproc`
make install -j `nproc`
#!/bin/bash
/usr/sbin/sshd -D &
jupyter notebook --ip=0.0.0.0 /paddle/book/
#!/bin/bash
set -e
# the number of process to run tests
NUM_PROC=6
# calculate and set the memory usage for each process
MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
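# e.g. NUM_PROC=6 gives 0.17, so each test process may use about 1/6 of GPU memory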
# get the CUDA device count
CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
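# Each process sees the full device list rotated by its index, e.g. with 4 GPUs
# process 0 gets "0,1,2,3" and process 5 gets "1,2,3,0".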
for (( i = 0; i < $NUM_PROC; i++ )); do
cuda_list=()
for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do
        s=$((i+j))
        n=$((s%CUDA_DEVICE_COUNT))
if [ $j -eq 0 ]; then
cuda_list=("$n")
else
cuda_list="$cuda_list,$n"
fi
done
echo $cuda_list
# CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
# ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
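    # -I $i,,$NUM_PROC makes ctest run every NUM_PROC-th test starting from test $i,
    # so the NUM_PROC processes split the full test list between them.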
env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC --output-on-failure &
done
wait
......@@ -104,6 +104,8 @@ function cmake_gen() {
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=ON
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
......@@ -129,7 +131,8 @@ EOF
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=ON
}
function abort(){
......
#!/bin/bash
set -e
# Create the build directory for CMake.
mkdir -p $TRAVIS_BUILD_DIR/build
cd $TRAVIS_BUILD_DIR/build
# Compile Documentation only.
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make -j `nproc` paddle_docs paddle_apis
# check websites for broken links
linkchecker doc/v2/en/html/index.html
linkchecker doc/v2/cn/html/index.html
linkchecker doc/v2/api/en/html/index.html
#!/bin/bash
set -e
# Create the build directory for CMake.
mkdir -p $TRAVIS_BUILD_DIR/build_ios
cd $TRAVIS_BUILD_DIR/build_ios
# Compile paddle binaries
cmake -DCMAKE_SYSTEM_NAME=iOS \
-DIOS_PLATFORM=OS \
-DCMAKE_OSX_ARCHITECTURES="arm64" \
-DWITH_C_API=ON \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_TESTING=OFF \
-DWITH_SWIG_PY=OFF \
-DCMAKE_BUILD_TYPE=Release \
..
make -j 2
#!/bin/bash
function abort(){
echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
echo "Please use pre-commit to check what is wrong." 1>&2
exit 1
}
trap 'abort' 0
set -e
# install glide
curl https://glide.sh/get | bash
eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
# set up go environment for running gometalinter
mkdir -p $GOPATH/src/github.com/PaddlePaddle/
ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
cd $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
go get github.com/alecthomas/gometalinter
gometalinter --install
cd $TRAVIS_BUILD_DIR
export PATH=/usr/bin:$PATH
pre-commit install
clang-format --version
if ! pre-commit run -a ; then
git diff
exit 1
fi
trap : 0
......@@ -23,7 +23,7 @@ from ..executor import global_scope
__all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
'Preprocessor'
'random_data_generator', 'Preprocessor'
]
......
......@@ -81,7 +81,7 @@ __all__ = [
'label_smooth',
'roi_pool',
'dice_loss',
'bilinear_interp',
'upsampling_bilinear2d',
]
......@@ -3917,8 +3917,10 @@ def dice_loss(input, label, epsilon=0.00001):
return reduce_mean(dice_score)
def bilinear_interp(input, out_h, out_w, name=None):
def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
"""
    The operation performed by upsampling_bilinear2d is known mathematically as
    bilinear interpolation.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this layer) on a rectilinear 2D grid.
......@@ -3930,8 +3932,13 @@ def bilinear_interp(input, out_h, out_w, name=None):
input (Variable): The input tensor of bilinear interpolation,
This is a 4-D tensor of the shape
(num_batches, channels, in_h, in_w).
out_h (int): output height of bilinear interpolation layer.
out_w (int): output width of bilinear interpolation layer.
out_shape(list|tuple|None): Output shape of bilinear interpolation
layer, the shape is (out_h, out_w).
Default: None
        scale(int|None): The multiplier for the input height or width.
            At least one of out_shape and scale must be set,
            and out_shape takes priority over scale.
Default: None
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
......@@ -3942,10 +3949,27 @@ def bilinear_interp(input, out_h, out_w, name=None):
Examples:
.. code-block:: python
out = fluid.layers.bilinear_interp(input, out_h=12, out_w=12)
out = fluid.layers.bilinear_interp(input, out_shape=[12, 12])
"""
if out_shape is None and scale is None:
raise ValueError("One of out_shape and scale must not be None")
helper = LayerHelper('bilinear_interp', **locals())
dtype = helper.input_dtype()
    def _is_list_or_tuple_(data):
        return (isinstance(data, list) or isinstance(data, tuple))
if out_shape is not None:
        if not (_is_list_or_tuple_(out_shape) and len(out_shape) == 2):
            raise ValueError('out_shape should be a list or tuple '
                             'with length 2, (out_h, out_w).')
out_shape = list(map(int, out_shape))
out_h = out_shape[0]
out_w = out_shape[1]
else:
out_h = int(input.shape[2] * scale)
out_w = int(input.shape[3] * scale)
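    # Illustrative example: an input of shape (N, C, 8, 6) with scale=2 yields
    # out_h=16 and out_w=12; when out_shape is given it takes precedence (see above).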
out = helper.create_tmp_variable(dtype)
helper.append_op(
type="bilinear_interp",
......
......@@ -93,12 +93,12 @@ def _convert_lod(lod):
def create_lod_tensor(data, lod, place):
"""Create a lod tensor from a numpy array or an existing lod tensor.
"""Create a lod tensor from a numpy array, a list, or an existing lod tensor.
Create a lod tensor by doing the following:
1. Check that the length-based input lod is valid.
2. Convert the length-based lod to a offset-based LoD.
    3. Copy the data from a numpy array or an existing lod tensor to
    3. Copy the data from a numpy array, a list or an existing lod tensor to
CPU or GPU device (based on input place).
4. Set the level of detail (LoD) using the offset-based LoD.
......@@ -117,7 +117,7 @@ def create_lod_tensor(data, lod, place):
for more details regarding LoD.
Args:
data: a numpy array or a LoDTensor holding the data to be copied.
data: a numpy array or a LoDTensor or a list holding the data to be copied.
lod: a list of lists indicating the length-based LoD info specified by the user.
place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
......@@ -126,6 +126,18 @@ def create_lod_tensor(data, lod, place):
"""
if isinstance(data, core.LoDTensor):
return create_lod_tensor(np.array(data), lod, place)
elif isinstance(data, list):
        # When the input data is a list, this only handles the case where each base
        # element is an index of shape [1] and dtype int64 (e.g., a word id). Hence,
        # the generated LoDTensor will be of shape [n, 1] and dtype int64, where `n`
        # is the total number of words or other indexes in the sequences.
new_lod = []
for seq in data:
new_lod.append(len(seq))
assert [new_lod] == lod, "data and lod do not match"
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
return create_lod_tensor(flattened_data, lod, place)
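        # Illustrative example: data = [[1, 2, 3], [4, 5]] with lod = [[3, 2]]
        # is flattened into a LoDTensor of shape [5, 1] and dtype int64.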
elif isinstance(data, np.ndarray):
assert _validate_lod(lod,
data.shape[0]), "the provided lod info is invalid"
......@@ -134,9 +146,8 @@ def create_lod_tensor(data, lod, place):
tensor.set_lod(_convert_lod(lod))
return tensor
else:
raise Exception(
"data should be either a LoDTensor or a Numpy array, but you pass type %s instead"
% (type(data)))
raise TypeError(
"data should be either a LoDTensor, a Numpy array or a list")
def create_random_int_lodtensor(lod, base_shape, place, low, high):
......
......@@ -48,7 +48,7 @@ def linear():
return avg_loss
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
......@@ -68,8 +68,8 @@ def train(use_cuda, train_program, save_dirname):
['15.343549569447836']
...
'''
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
trainer.stop()
trainer.train(
......@@ -80,13 +80,13 @@ def train(use_cuda, train_program, save_dirname):
# infer
def infer(use_cuda, inference_program, save_dirname=None):
if save_dirname is None:
def infer(use_cuda, inference_program, params_dirname=None):
if params_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 10
tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
......@@ -100,10 +100,10 @@ def main(use_cuda):
return
# Directory for saving the trained model
save_dirname = "fit_a_line.inference.model"
params_dirname = "fit_a_line.inference.model"
train(use_cuda, linear, save_dirname)
infer(use_cuda, inference_program, save_dirname)
train(use_cuda, linear, params_dirname)
infer(use_cuda, inference_program, params_dirname)
class TestFitALine(unittest.TestCase):
......
......@@ -85,7 +85,7 @@ def train_network():
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
BATCH_SIZE = 128
EPOCH_NUM = 1
......@@ -105,8 +105,8 @@ def train(use_cuda, train_program, save_dirname):
print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......@@ -122,10 +122,10 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
# The input's dimension of conv should be 4-D or 5-D.
    # Use normalized image pixels as input data, which should be in the range
......@@ -142,12 +142,14 @@ def main(use_cuda):
save_path = "image_classification_resnet.inference.model"
train(
use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
save_dirname=save_path)
params_dirname=save_path)
if __name__ == '__main__':
......
......@@ -64,7 +64,7 @@ def train_network():
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
BATCH_SIZE = 128
train_reader = paddle.batch(
paddle.reader.shuffle(
......@@ -82,8 +82,8 @@ def train(use_cuda, train_program, save_dirname):
print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
if save_dirname is not None:
trainer.save_params(save_dirname)
if params_dirname is not None:
trainer.save_params(params_dirname)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......@@ -99,10 +99,10 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
# The input's dimension of conv should be 4-D or 5-D.
    # Use normalized image pixels as input data, which should be in the range
......@@ -119,12 +119,14 @@ def main(use_cuda):
save_path = "image_classification_vgg.inference.model"
train(
use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
save_dirname=save_path)
params_dirname=save_path)
if __name__ == '__main__':
......
......@@ -141,7 +141,7 @@ def train_program():
return [avg_cost]
def train(use_cuda, train_program, save_path):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
......@@ -172,7 +172,7 @@ def train(use_cuda, train_program, save_path):
print("avg_cost: %s" % avg_cost)
if float(avg_cost) < 100.0: # Large value to increase CI speed
trainer.save_params(save_path)
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
float(avg_cost)))
......@@ -183,7 +183,7 @@ def train(use_cuda, train_program, save_path):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_path)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -197,10 +197,10 @@ def train(use_cuda, train_program, save_path):
feed_order=feed_order)
def infer(use_cuda, inference_program, save_path):
def infer(use_cuda, inference_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
inference_program, param_path=save_path, place=place)
inference_program, param_path=params_dirname, place=place)
# Setup inputs by creating LoDTensors to represent sequences of words.
# Here each word is the basic element of these LoDTensors and the shape of
......@@ -251,9 +251,9 @@ def infer(use_cuda, inference_program, save_path):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "label_semantic_roles.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "label_semantic_roles.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -57,7 +57,7 @@ def train_program():
return [avg_cost, acc]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
......@@ -78,7 +78,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
event.epoch + 1, avg_cost, acc))
......@@ -100,11 +100,11 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -116,17 +116,17 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
save_dirname = "recognize_digits_conv.inference.model"
params_dirname = "recognize_digits_conv.inference.model"
# call train() with is_local argument to run distributed train
train(
use_cuda=use_cuda,
train_program=train_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
if __name__ == '__main__':
......
......@@ -44,7 +44,7 @@ def train_program():
return [avg_cost, acc]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
......@@ -62,7 +62,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
event.epoch + 1, avg_cost, acc))
......@@ -81,11 +81,11 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -97,17 +97,17 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
save_dirname = "recognize_digits_mlp.inference.model"
params_dirname = "recognize_digits_mlp.inference.model"
# call train() with is_local argument to run distributed train
train(
use_cuda=use_cuda,
train_program=train_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
save_dirname=save_dirname)
params_dirname=params_dirname)
if __name__ == '__main__':
......
......@@ -155,7 +155,7 @@ def train_program():
return [avg_cost, scale_infer]
def train(use_cuda, train_program, save_path):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.SGD(learning_rate=0.2)
......@@ -180,7 +180,7 @@ def train(use_cuda, train_program, save_path):
print("avg_cost: %s" % avg_cost)
if float(avg_cost) < 4: # Smaller value to increase CI speed
trainer.save_params(save_path)
trainer.save_params(params_dirname)
trainer.stop()
else:
print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
......@@ -197,43 +197,30 @@ def train(use_cuda, train_program, save_path):
num_epochs=1,
event_handler=event_handler,
reader=train_reader,
feed_order=[
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id',
'category_id', 'movie_title', 'score'
])
feed_order=feed_order)
def infer(use_cuda, inference_program, save_path):
def infer(use_cuda, inference_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
inference_program, param_path=save_path, place=place)
def create_lod_tensor(data, lod=None):
tensor = fluid.LoDTensor()
if lod is None:
# Tensor, the shape is [batch_size, 1]
index = 0
lod_0 = [index]
for l in range(len(data)):
index += 1
lod_0.append(index)
lod = [lod_0]
tensor.set_lod(lod)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
tensor.set(flattened_data, place)
return tensor
# Generate a random input for inference
user_id = create_lod_tensor([[1]])
gender_id = create_lod_tensor([[1]])
age_id = create_lod_tensor([[0]])
job_id = create_lod_tensor([[10]])
movie_id = create_lod_tensor([[783]])
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
[[0, 5]])
inference_program, param_path=params_dirname, place=place)
# Use the first data from paddle.dataset.movielens.test() as input.
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor,
# where `data` is a list of sequences of index numbers, `lod` is
# the level of detail (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
# indicating that `data` consists of two sequences of length 3 and 2.
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
age_id = fluid.create_lod_tensor([[0]], [[1]], place)
job_id = fluid.create_lod_tensor([[10]], [[1]], place)
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
place)
results = inferencer.infer(
{
......@@ -253,12 +240,15 @@ def infer(use_cuda, inference_program, save_path):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "recommender_system.inference.model"
train(use_cuda=use_cuda, train_program=train_program, save_path=save_path)
params_dirname = "recommender_system.inference.model"
train(
use_cuda=use_cuda,
train_program=train_program,
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
save_path=save_path)
params_dirname=params_dirname)
if __name__ == '__main__':
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# This test is buggy
# py_test(test_understand_sentiment_dynamic_rnn SRCS
# test_understand_sentiment_dynamic_rnn.py SERIAL)
LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn)
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
......
......@@ -64,7 +64,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -85,7 +85,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -97,7 +97,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -112,13 +112,13 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
......@@ -143,9 +143,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -79,7 +79,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -100,7 +100,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -112,7 +112,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -127,13 +127,13 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
......@@ -158,9 +158,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_conv.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -71,7 +71,7 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
......@@ -92,7 +92,7 @@ def train(use_cuda, train_program, save_dirname):
print("acc : %s" % acc)
if acc > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
else:
......@@ -104,7 +104,7 @@ def train(use_cuda, train_program, save_dirname):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
train_reader = paddle.batch(
......@@ -119,13 +119,13 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['words', 'label'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
infer_func=partial(inference_program, word_dict),
param_path=save_dirname,
param_path=params_dirname,
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
......@@ -150,9 +150,9 @@ def infer(use_cuda, inference_program, save_dirname=None):
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_stacked_lstm.inference.model"
train(use_cuda, train_program, save_path)
infer(use_cuda, inference_program, save_path)
params_dirname = "understand_sentiment_stacked_lstm.inference.model"
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
if __name__ == '__main__':
......
......@@ -80,7 +80,7 @@ def train_program(is_sparse):
return avg_cost
def train(use_cuda, train_program, save_dirname):
def train(use_cuda, train_program, params_dirname):
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
test_reader = paddle.batch(
......@@ -97,7 +97,7 @@ def train(use_cuda, train_program, save_dirname):
print("loss= ", avg_cost)
if avg_cost < 10.0:
trainer.save_params(save_dirname)
trainer.save_params(params_dirname)
trainer.stop()
if math.isnan(avg_cost):
......@@ -115,10 +115,10 @@ def train(use_cuda, train_program, save_dirname):
feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
def infer(use_cuda, inference_program, save_dirname=None):
def infer(use_cuda, inference_program, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=save_dirname, place=place)
infer_func=inference_program, param_path=params_dirname, place=place)
# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
# is simply an index to look up for the corresponding word vector and hence
......@@ -153,17 +153,17 @@ def main(use_cuda, is_sparse):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "word2vec.inference.model"
params_dirname = "word2vec.inference.model"
train(
use_cuda=use_cuda,
train_program=partial(train_program, is_sparse),
save_dirname=save_path)
params_dirname=params_dirname)
infer(
use_cuda=use_cuda,
inference_program=partial(inference_program, is_sparse),
save_dirname=save_path)
params_dirname=params_dirname)
if __name__ == '__main__':
......
......@@ -173,63 +173,33 @@ def train(use_cuda, save_dirname, is_local=True):
test_reader = paddle.batch(
paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
feeding = {
'user_id': 0,
'gender_id': 1,
'age_id': 2,
'job_id': 3,
'movie_id': 4,
'category_id': 5,
'movie_title': 6,
'score': 7
}
def func_feed(feeding, data):
feed_tensors = {}
for (key, idx) in feeding.iteritems():
tensor = fluid.LoDTensor()
if key != "category_id" and key != "movie_title":
if key == "score":
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"float32")
else:
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"int64")
else:
numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
data)
lod_info = [len(item) for item in numpy_data]
offset = 0
lod = [offset]
for item in lod_info:
offset += item
lod.append(offset)
numpy_data = np.concatenate(numpy_data, axis=0)
tensor.set_lod([lod])
numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
tensor.set(numpy_data, place)
feed_tensors[key] = tensor
return feed_tensors
feed_order = [
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
'movie_title', 'score'
]
def train_loop(main_program):
exe.run(framework.default_startup_program())
feed_list = [
main_program.global_block().var(var_name) for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
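        # feeder.feed(data) turns each minibatch into the tensors expected by the
        # variables named in feed_order, instead of building LoDTensors by hand as
        # the former func_feed helper did.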
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for batch_id, data in enumerate(train_reader()):
# train a mini-batch
outs = exe.run(program=main_program,
feed=func_feed(feeding, data),
feed=feeder.feed(data),
fetch_list=[avg_cost])
out = np.array(outs[0])
if (batch_id + 1) % 10 == 0:
avg_cost_set = []
for test_data in test_reader():
avg_cost_np = exe.run(
program=test_program,
feed=func_feed(feeding, test_data),
fetch_list=[avg_cost])
avg_cost_np = exe.run(program=test_program,
feed=feeder.feed(test_data),
fetch_list=[avg_cost])
avg_cost_set.append(avg_cost_np[0])
break # test only 1 segment for speeding up CI
......@@ -279,23 +249,6 @@ def infer(use_cuda, save_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
def create_lod_tensor(data, lod=None):
tensor = fluid.LoDTensor()
if lod is None:
# Tensor, the shape is [batch_size, 1]
index = 0
lod_0 = [index]
for l in range(len(data)):
index += 1
lod_0.append(index)
lod = [lod_0]
tensor.set_lod(lod)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
tensor.set(flattened_data, place)
return tensor
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
# Use fluid.io.load_inference_model to obtain the inference program desc,
......@@ -307,26 +260,33 @@ def infer(use_cuda, save_dirname=None):
# Use the first data from paddle.dataset.movielens.test() as input
assert feed_target_names[0] == "user_id"
user_id = create_lod_tensor([[1]])
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor
# where `data` is a list of sequences of index numbers, `lod` is
# the level of detail (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
# indicating that `data` consists of two sequences of length 3 and 2.
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
assert feed_target_names[1] == "gender_id"
gender_id = create_lod_tensor([[1]])
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
assert feed_target_names[2] == "age_id"
age_id = create_lod_tensor([[0]])
age_id = fluid.create_lod_tensor([[0]], [[1]], place)
assert feed_target_names[3] == "job_id"
job_id = create_lod_tensor([[10]])
job_id = fluid.create_lod_tensor([[10]], [[1]], place)
assert feed_target_names[4] == "movie_id"
movie_id = create_lod_tensor([[783]])
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
assert feed_target_names[5] == "category_id"
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
assert feed_target_names[6] == "movie_title"
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
[[0, 5]])
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]],
[[5]], place)
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
......
......@@ -53,11 +53,14 @@ class TestLoDTensor(unittest.TestCase):
self.assertEqual(_convert_lod(lod), converted_lod)
def test_create_lod_tensor(self):
# Only numpy array or a fluid LoDTensor is valid input to
# create_lod_tensor function, currently a list of lists is not.
data = [[1, 2], [3, 4]]
self.assertRaises(Exception, create_lod_tensor, data, [],
# Create LoDTensor from a list
data = [[1, 2, 3], [3, 4]]
wrong_lod = [[2, 2]]
correct_lod = [[3, 2]]
self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod,
fluid.CPUPlace())
tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace())
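        # The length-based lod [[3, 2]] is stored internally in offset form as [[0, 3, 5]].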
self.assertEqual(tensor.lod(), [[0, 3, 5]])
# Create LoDTensor from numpy array
data = numpy.random.random([10, 1])
......
......@@ -479,9 +479,9 @@ class OpTest(unittest.TestCase):
def np_dtype_to_fluid_dtype(input):
"""Change the dtype of float16 numpy array
        numpy float16 is bound to paddle::platform::float16
        in tensor_py.h via the help of uint16 data type since
        the internal memory representation of float16 is
        uint16_t in paddle and np.uint16 in numpy, which are
        themselves bound together by pybind.
......@@ -489,9 +489,9 @@ class OpTest(unittest.TestCase):
input: input numpy array
Returns:
            input: The dtype of input will be changed to np.uint16 if
                it is originally np.float16, such that the internal memory
                of input will be reinterpreted as of dtype np.uint16.
"""
if input.dtype == np.float16:
input.dtype = np.uint16
......
......@@ -50,5 +50,27 @@ class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
self.check_output()
class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest):
def setUp(self):
self.op_type = "fill_constant_batch_size_like"
self.inputs = {
'Input': (np.random.random((31, 28)).astype("float32"),
[[0, 9, 23, 31]])
}
self.attrs = {
'value': 3.5,
'shape': [-1, 16],
'input_dim_idx': 0,
'output_dim_idx': 0
}
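        # The input LoD [[0, 9, 23, 31]] contains 3 sequences, so the batch dimension
        # of the output is 3 and Out is expected to have shape [3, 16].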
out = np.random.random((3, 16)).astype("float32")
out.fill(3.5)
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
......@@ -369,11 +369,13 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output)
print(str(program))
def test_bilinear_interp(self):
def test_upsampling_bilinear2d(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
output = layers.bilinear_interp(x, 12, 12)
output = layers.upsampling_bilinear2d(x, out_shape=[12, 12])
self.assertIsNotNone(output)
output = layers.upsampling_bilinear2d(x, scale=3)
self.assertIsNotNone(output)
print(str(program))
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
def PolygonBoxRestore(input):
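    # Reference implementation: build a per-pixel (w, h) index grid, broadcast it
    # across the geo channels, and subtract the input, which is treated as offsets.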
shape = input.shape
batch_size = shape[0]
geo_channels = shape[1]
h = shape[2]
w = shape[3]
h_indexes = np.array(range(h) * w).reshape(
[w, h]).transpose()[np.newaxis, :] # [1, h, w]
w_indexes = np.array(range(w) * h).reshape(
[h, w])[np.newaxis, :] # [1, h, w]
indexes = np.concatenate(
(w_indexes, h_indexes))[np.newaxis, :] # [1, 2, h, w]
indexes = indexes.repeat(
[geo_channels / 2],
axis=0)[np.newaxis, :] # [1, geo_channels/2, 2, h, w]
indexes = indexes.repeat(
[batch_size], axis=0) # [batch_size, geo_channels/2, 2, h, w]
return indexes.reshape(
input.shape) - input # [batch_size, geo_channels, h, w]
class TestPolygonBoxRestoreOp(OpTest):
def config(self):
self.input_shape = (1, 8, 2, 2)
def setUp(self):
self.config()
self.op_type = "polygon_box_transform"
input = np.random.random(self.input_shape).astype("float32")
self.inputs = {'Input': input}
output = PolygonBoxRestore(input)
self.outputs = {'Output': output}
def test_check_output(self):
self.check_output()
class TestCase1(TestPolygonBoxRestoreOp):
def config(self):
self.input_shape = (2, 10, 3, 2)
class TestCase2(TestPolygonBoxRestoreOp):
def config(self):
self.input_shape = (3, 12, 4, 5)
if __name__ == '__main__':
unittest.main()