Commit 7b7a4afa authored by Zhen Wang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-default-value

Change the default value of the parameter 'drop_last' in 'paddle.batch' to False.
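For context, a minimal sketch of the behavior this default controls; the `batch` helper below is a hypothetical stand-in illustrating the semantics, not PaddlePaddle's implementation:

```python
def batch(reader, batch_size, drop_last=False):  # new default: keep the tail
    def batch_reader():
        buf = []
        for item in reader():
            buf.append(item)
            if len(buf) == batch_size:
                yield buf
                buf = []
        # With drop_last=False, a final batch smaller than batch_size
        # is still yielded instead of being silently discarded.
        if buf and not drop_last:
            yield buf
    return batch_reader

# batch(lambda: iter(range(5)), 2)() yields [0, 1], [2, 3], [4]
```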
......@@ -66,6 +66,12 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocol" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
# PY_VERSION
if(NOT PY_VERSION)
set(PY_VERSION 2.7)
endif()
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -146,6 +152,7 @@ endif()
########################################################################################
include(external/mklml) # download mklml package
include(external/libxsmm) # download, build, install libxsmm
include(external/zlib) # download, build, install zlib
include(external/gflags) # download, build, install gflags
include(external/glog) # download, build, install glog
......@@ -232,6 +239,10 @@ if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(WITH_LIBXSMM)
list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
endif()
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
......@@ -271,7 +282,3 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
if (WITH_CONTRIB)
add_subdirectory(paddle/contrib)
endif()
......@@ -80,7 +80,7 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
pip install opencv-python
#For docstring checker
RUN pip install pylint pytest astroid isort
RUN pip install pylint pytest astroid isort LinkChecker
COPY ./python/requirements.txt /root/
RUN pip install -r /root/requirements.txt
......
......@@ -210,7 +210,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
# generate fake:
if args.use_fake_data:
for var in feed_var_list:
v = startup_prog.global_block().clone_variable(var)
v = startup_prog.global_block()._clone_variable(var)
var.persistable = True
v.persistable = True
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF)
IF(NOT WITH_LIBXSMM)
return()
ENDIF()
IF(WIN32 OR APPLE OR ANDROID OR IOS)
MESSAGE(WARNING "Windows, Mac and mobile platforms are not yet supported with libxsmm in Paddle.")
SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE)
return()
ENDIF()
INCLUDE (ExternalProject)
SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm)
SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm)
SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE)
SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE)
SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a"
"${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
ExternalProject_Add(
extern_libxsmm
GIT_REPOSITORY "https://github.com/hfp/libxsmm.git"
GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2"
PREFIX ${LIBXSMM_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install
INSTALL_COMMAND ""
)
ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a")
SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
include_directories(${LIBXSMM_INCLUDE_DIR})
ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
ADD_DEPENDENCIES(libxsmm extern_libxsmm)
LIST(APPEND external_project_dependencies libxsmm)
......@@ -121,6 +121,11 @@ ELSE()
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
IF(WITH_LIBXSMM)
TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS})
ADD_DEPENDENCIES(cblas extern_libxsmm)
ENDIF()
IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas)
LIST(APPEND external_project_dependencies cblas)
......
......@@ -18,8 +18,9 @@ ENDIF()
INCLUDE(python_module)
FIND_PACKAGE(PythonInterp 2.7)
FIND_PACKAGE(PythonLibs 2.7)
FIND_PACKAGE(PythonInterp ${PY_VERSION})
FIND_PACKAGE(PythonLibs ${PY_VERSION})
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
......
......@@ -138,25 +138,24 @@ copy(memory_lib
set(inference_deps paddle_fluid_shared paddle_fluid)
if(WITH_CONTRIB)
message(STATUS "installing contrib")
set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
if (WITH_ANAKIN AND WITH_GPU)
copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
list(APPEND inference_deps contrib_anakin_inference_lib)
endif()
copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
DSTS ${contrib_dst_dir} ${contrib_dst_dir})
list(APPEND inference_deps contrib_inference_lib)
set(module "inference/api")
if (WITH_ANAKIN AND WITH_GPU)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
list(APPEND inference_deps anakin_inference_lib)
endif()
copy(inference_api_lib DEPS paddle_inference_api paddle_inference_api_shared
SRCS ${src_dir}/${module}/paddle_inference_api.h
${src_dir}/${module}/demo_ci
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libpaddle_inference_api*
DSTS ${dst_dir}/inference ${dst_dir}/inference ${dst_dir}/inference
)
list(APPEND inference_deps inference_api_lib)
set(module "inference")
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
......
......@@ -98,13 +98,13 @@ class Block(objects):
def append_operator(self, ...):
self.ops.append(Operator(self, ...))
def prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
def _prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
self.ops.prepend(Operator(self, ...))
```
`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
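To make the ordering concrete, here is a toy sketch in plain Python; the class bodies are simplified stand-ins for the design above, not the real framework code:

```python
class Block:
    def __init__(self):
        self.ops = []

    def append_operator(self, op):
        self.ops.append(op)

    def _prepend_operator(self, op):
        # Initialization ops go into the block's preamble, ahead of
        # any compute ops that were appended earlier.
        self.ops.insert(0, op)

class Parameter:
    def __init__(self, global_block, name):
        self.name = name
        # The ctor schedules its own init op, so even a parameter created
        # late (e.g. inside a sub-block) is initialized before any use.
        global_block._prepend_operator("init " + name)

block = Block()
block.append_operator("fc")
Parameter(block, "w")
assert block.ops == ["init w", "fc"]
```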
### Operator
......
......@@ -78,7 +78,7 @@ def error_clip_callback(block, context):
op_desc = block.desc.op(block.desc.op_size() - 1)
for grad_n in filter(lambda n: grad_to_var.has_key(n),
op_desc.output_arg_names()):
fwd_var = block.var_recursive(grad_to_var[grad_n])
fwd_var = block._var_recursive(grad_to_var[grad_n])
error_clip = getattr(fwd_var, "error_clip", None)
if not (error_clip is None or isinstance(error_clip,
BaseErrorClipAttr)):
......
......@@ -4,7 +4,6 @@ API
.. toctree::
:maxdepth: 1
overview.rst
model_configs.rst
data.rst
run_logic.rst
......@@ -35,11 +35,16 @@ PaddlePaddle requires a Docker environment to build, which avoids separately installing build dependencies
# 2. Optional: build the Docker image for compiling PaddlePaddle from source
docker build -t paddle:dev .
# 3. Run the following command to build a CPU-only binary
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or use the image built in the optional step above (step 2 must be run first)
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
Note: the command above maps the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container.
Note:
- The commands above map the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container.
- If you build with a manylinux image, you need to specify a `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__ via the environment variable :code:`PYTHON_ABI`.
The Python ABIs currently supported by PaddlePaddle are :code:`cp27-cp27m` and :code:`cp27-cp27mu`.
After the build finishes, the output whl package is generated under build/python/dist; you can install it on the current machine or copy it to the target machine:
......
......@@ -36,13 +36,18 @@ If you don't wish to use docker, you need to install several compile dependencies
# 2. Optional: build development docker image from source
docker build -t paddle:dev .
# 3. Run the following command to build a CPU-Only binaries
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
NOTE: The above command tries to mount the current working directory (root directory of source code)
NOTE:
- The above command tries to mount the current working directory (root directory of source code)
into the :code:`/paddle` directory inside the docker container.
- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__.
The Python ABIs currently supported by PaddlePaddle are :code:`cp27-cp27m` and :code:`cp27-cp27mu`.
When the compile finishes, you can get the output whl package under
build/python/dist, then you can choose to install the whl on local
machine or copy it to the target machine.
......
......@@ -118,7 +118,7 @@ class Float16Transpiler:
for var in self.block.vars.keys():
if var not in args:
self.block.remove_var(var)
self.block._remove_var(var)
def _modify_feed_fetch(self):
'''
......@@ -165,7 +165,7 @@ class Float16Transpiler:
dtype=core.VarDesc.VarType.FP16,
shape=var.shape,
persistable=var.persistable)
self.block.insert_op(
self.block._insert_op(
i + 1,
type="cast",
inputs={"X": var},
......@@ -188,7 +188,7 @@ class Float16Transpiler:
persistable=var.persistable)
find_op(var)
var.op.rename_output(var_name, tmp_var_name)
self.block.insert_op(
self.block._insert_op(
i,
type="cast",
inputs={"X": tmp_var},
......@@ -253,4 +253,4 @@ class Float16Transpiler:
# old var will be replaced by the fp16 var in program desc
self.input_map[var.name] = fp16_var_name
self.block.remove_var(var.name)
self.block._remove_var(var.name)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
inference_api_test(simple_on_word2vec ARGS test_word2vec)
option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
if(NOT WITH_INFERENCE_DEMO)
return()
endif()
set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo")
set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F)
function(inference_download_test_demo TARGET)
if (NOT WITH_TESTING)
return()
endif()
set(options "")
set(oneValueArgs URL)
set(multiValueArgs SRCS)
cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}")
message(STATUS "inference demo ${test_dir}")
if(NOT EXISTS "${test_dir}")
message(STATUS "Download ${TARGET} model from ${tests_URL}")
execute_process(COMMAND bash -c "mkdir -p ${test_dir}")
execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}")
execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz")
endif()
cc_test(${TARGET} SRCS "${tests_SRCS}"
DEPS paddle_inference_api paddle_fluid
ARGS --data=${test_dir}/data.txt
--modeldir=${test_dir}/model
--refer=${test_dir}/result.txt)
endfunction()
# disable mobilenet test
#inference_download_test_demo(mobilenet_inference_demo
# SRCS vis_demo.cc
# URL ${URL_ROOT}mobilenet.tar.gz)
inference_download_test_demo(se_resnext50_inference_demo
SRCS vis_demo.cc
URL ${URL_ROOT}se_resnext50.tar.gz)
inference_download_test_demo(ocr_inference_demo
SRCS vis_demo.cc
URL ${URL_ROOT}ocr.tar.gz)
# Inference Demos
Input data format:
- Each line contains a single record
- Each record's format is
```
<space separated floats as data>\t<space separated ints as shape>
```
See the C++ code in `vis_demo.cc`.
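For illustration, a hypothetical record holding four floats with shape `[2, 2]` could be parsed as follows (a Python sketch of the same logic `ProcessALine` implements in `vis_demo.cc`):

```python
line = "0.1 0.2 0.3 0.4\t2 2"  # made-up sample record
data_str, shape_str = line.split("\t")
data = [float(x) for x in data_str.split()]
shape = [int(x) for x in shape_str.split()]
assert len(data) == shape[0] * shape[1]
```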
## MobileNet
To execute the demo, simply run
```sh
./mobilenet_inference_demo --modeldir <model> --data <datafile>
```
## SE-ResNeXt-50
To execute the demo, simply run
```sh
./se_resnext50_inference_demo --modeldir <model> --data <datafile>
```
## OCR
To execute the demo, simply run
```sh
./ocr_inference_demo --modeldir <model> --data <datafile>
```
This diff has been collapsed.
......@@ -276,13 +276,22 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
}
}
// Insert BCast Ops
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set[dev_id];
for (auto &bcast_name : to_bcast_set) {
CreateBroadcastOp(&result, bcast_name, dev_id);
bool use_gpu = false;
#ifdef PADDLE_WITH_CUDA
use_gpu = nccl_ctxs_ != nullptr;
#endif
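// Broadcast ops are only needed on GPU or under the all-reduce strategy;
// with the CPU reduce mode each parameter stays on its owning device.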
if (use_gpu ||
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
// Insert BCast Ops
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set[dev_id];
for (auto &bcast_name : to_bcast_set) {
CreateBroadcastOp(&result, bcast_name, dev_id);
}
}
}
/*
Dependency graph has been constructed. However, there are still data
hazards that need to be handled.
......@@ -412,14 +421,19 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
return -1;
}
for (auto &varname : op.InputArgumentNames()) {
int dev_id = GetVarDeviceID(varname);
if (dev_id != -1) {
return dev_id;
}
int op_role = boost::get<int>(
op.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
if (op_role != static_cast<int>(framework::OpRole::kOptimize)) {
return -1;
}
return -1;
auto param_grad = boost::get<std::vector<std::string>>(
op.GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
int dev_id = GetVarDeviceID(param_grad[1]);
PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", op.Type(),
param_grad[0]);
return dev_id;
}
int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include <stdexcept>
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
......@@ -53,8 +54,14 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
}
}
}
std::vector<framework::LoDTensor> fetch_data;
std::exception_ptr eptr;
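// Capture any exception so the scope cleanup below still runs;
// the exception is rethrown after cleanup.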
try {
fetch_data = underlying_executor_->Run(fetch_tensors);
} catch (...) {
eptr = std::current_exception();
}
auto fetch_data = underlying_executor_->Run(fetch_tensors);
drop_scope_counter_ += 1;
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
......@@ -69,7 +76,11 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
scope->DeleteScope(local_scope);
}
}
return fetch_data;
if (eptr) {
std::rethrow_exception(eptr);
} else {
return fetch_data;
}
}
} // namespace details
} // namespace framework
......
......@@ -78,6 +78,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
set.clear();
};
// Clean run context
run_op_futures_.clear();
exception_.reset();
// Step 3. Execution
while (!pending_vars.empty()) {
// 1. Run All Ready ops
......@@ -96,16 +100,19 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) {
std::lock_guard<std::mutex> l(exception_mu_);
std::unique_lock<std::mutex> l(exception_mu_);
if (exception_) {
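// Release the lock so in-flight ops can finish and record their
// state; wait for them before rethrowing.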
l.unlock();
for (auto &run_op_future : run_op_futures_) {
run_op_future.wait();
}
l.lock();
std::exception *exp = exception_.get();
if (dynamic_cast<platform::EOFException *>(exp)) {
auto e = *static_cast<platform::EOFException *>(exp);
exception_.reset();
throw e;
} else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
auto e = *static_cast<platform::EnforceNotMet *>(exp);
exception_.reset();
throw e;
} else {
LOG(FATAL) << "Unknown exception.";
......@@ -222,7 +229,7 @@ void ThreadedSSAGraphExecutor::RunOp(
}
};
if (pool_) {
pool_->enqueue(op_run);
run_op_futures_.emplace_back(pool_->enqueue(op_run));
} else {
op_run();
}
......
......@@ -15,6 +15,7 @@
#pragma once
#include <deque>
#include <list>
#include <string>
#include <unordered_set>
#include <utility>
......@@ -77,6 +78,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
private:
ExecutionStrategy strategy_;
// use std::list because push_back is O(1) and does not invalidate iterators
std::list<std::future<void>> run_op_futures_;
};
} // namespace details
......
......@@ -45,6 +45,7 @@ class ParallelExecutorPrivate {
#endif
bool own_local_scope_;
bool use_cuda_;
bool use_all_reduce_;
};
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
......@@ -62,6 +63,14 @@ ParallelExecutor::ParallelExecutor(
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_;
member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
if (!member_->use_all_reduce_) {
PADDLE_ENFORCE(places.size() > 1,
"If you set build_strategy.reduce with 'Reduce',"
"the number of places must be greater than 1.");
}
// Step 1. Bcast the params to devs.
// Create local scopes
......@@ -95,7 +104,7 @@ ParallelExecutor::ParallelExecutor(
}
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
BCastParamsToGPUs(bcast_vars);
BCastParamsToDevices(bcast_vars);
}
// Startup Program has been run. All local scopes has correct parameters.
......@@ -117,7 +126,7 @@ ParallelExecutor::ParallelExecutor(
#ifdef PADDLE_WITH_CUDA
builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
#else
PADDLE_THROW("Not compiled with CUDA");
PADDLE_THROW("Not compiled with CUDA.");
#endif
}
......@@ -131,9 +140,9 @@ ParallelExecutor::ParallelExecutor(
member_->places_, std::move(member_->executor_)));
}
void ParallelExecutor::BCastParamsToGPUs(
void ParallelExecutor::BCastParamsToDevices(
const std::unordered_set<std::string> &vars) const {
// the the initializing bcast, all vars would be bcast from device(0),
// the initializing bcast, all vars would be bcast from device(0),
// otherwise
// bcast from the specified device.
bool initializing = builder_.get() == nullptr ? true : false;
......@@ -202,12 +211,23 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif
} else {
platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) {
for (size_t i = 0; i < member_->places_.size(); ++i) {
if ((initializing && i == 0) ||
(!initializing && static_cast<int>(i) == var_dev_id))
continue;
auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
t->Resize(dims);
t->mutable_data(cpu, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t);
// FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
if (member_->use_all_reduce_ || member_->use_cuda_ ||
var == "@LR_DECAY_COUNTER@") {
t->Resize(dims);
t->mutable_data(cpu, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t);
} else {
t->ShareDataWith(main_tensor);
}
}
}
}
......
......@@ -66,7 +66,7 @@ class ParallelExecutor {
void Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name);
void BCastParamsToGPUs(const std::unordered_set<std::string> &vars) const;
void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
private:
ParallelExecutorPrivate *member_;
......
......@@ -29,11 +29,11 @@ enum ReaderStatus { kRunning, kStopped };
class ReaderBase {
public:
void ReadNext(std::vector<LoDTensor>* out);
virtual void ReadNext(std::vector<LoDTensor>* out);
void Shutdown();
virtual void Shutdown();
void Start();
virtual void Start();
// Return the readers at the end of the decorating chain. Basically
// they are the readers just before the read op.
......@@ -42,7 +42,7 @@ class ReaderBase {
virtual ~ReaderBase();
protected:
virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
virtual void ReadNextImpl(std::vector<LoDTensor>* out) {}
virtual void ShutdownImpl() {}
......
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor )
# analysis and tensorrt must be added before creating the static library,
# otherwise there would be undefined references to them in the static library.
add_subdirectory(analysis)
if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api
......@@ -7,12 +14,14 @@ cc_library(paddle_fluid_api
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
if(WITH_CONTRIB)
set(fluid_modules "${fluid_modules}" paddle_inference_api)
endif()
# Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
if(NOT APPLE)
# TODO(liuyiqu): Temporarily disable the link flag because it is not supported on Mac.
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
# Create shared library
cc_library(paddle_fluid_shared SHARED
SRCS io.cc
......@@ -29,9 +38,4 @@ if(WITH_TESTING)
# both tests/book and analysis depend on the models generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book)
endif()
add_subdirectory(analysis)
if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
add_subdirectory(api)
......@@ -90,6 +90,20 @@ std::string DataFlowGraph::DotString() const {
return dot.Build();
}
std::string DataFlowGraph::HumanReadableInfo(bool show_values,
bool show_functions) const {
std::stringstream values, functions;
for (auto &n : nodes.nodes()) {
if (show_values && n->IsValue()) {
values << n->repr() << "\n";
}
if (show_functions && n->IsFunction()) {
functions << n->repr() << "\n";
}
}
return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str();
}
//
// NodesBFSIterator
//
......@@ -146,7 +160,7 @@ bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
if ((!queue_.empty()) && (!other.queue_.empty())) {
return queue_.front() == other.queue_.front() &&
visited_.size() == other.visited_.size(); // here need to check the
// equality of queue and
// equality of queue and
// visited. Just a light but weak implementation.
}
return false;
......@@ -208,6 +222,76 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
return stack_.top();
}
GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
const std::vector<Node *> &source) {
PADDLE_ENFORCE(!source.empty(),
"Start points of topological sorting should not be empty!");
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
std::vector<Node *> inlink_visited;
while (!to_visit.empty()) {
std::vector<Node *> queue(to_visit.begin(), to_visit.end());
for (auto *p : queue) {
inlink_visited.clear();
std::copy_if(p->inlinks.begin(), p->inlinks.end(),
std::back_inserter(inlink_visited),
[&](Node *x) { return visited.count(x); });
if (inlink_visited.size() == p->inlinks.size()) {
sorted_.push_back(p);
for (auto *_ : p->outlinks) {
if (!visited.count(_)) {
to_visit.insert(_);
}
}
to_visit.erase(p);
visited.insert(p);
}
}
}
}
GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
const paddle::inference::analysis::GraphTraits<
DataFlowGraph>::NodesTSIterator &other)
: sorted_(other.sorted_), cursor_(other.cursor_) {}
Node &GraphTraits<DataFlowGraph>::NodesTSIterator::operator*() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return *sorted_[cursor_];
}
paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator
&GraphTraits<DataFlowGraph>::NodesTSIterator::operator++() {
if (++cursor_ >= sorted_.size()) {
sorted_.clear();
cursor_ = 0;
}
return *this;
}
paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator &
GraphTraits<DataFlowGraph>::NodesTSIterator::operator=(
const paddle::inference::analysis::GraphTraits<
DataFlowGraph>::NodesTSIterator &other) {
cursor_ = other.cursor_;
sorted_ = other.sorted_;
return *this;
}
bool GraphTraits<DataFlowGraph>::NodesTSIterator::operator==(
const paddle::inference::analysis::GraphTraits<
DataFlowGraph>::NodesTSIterator &other) {
return sorted_ == other.sorted_ && cursor_ == other.cursor_;
}
Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return sorted_[cursor_];
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -48,6 +48,9 @@ struct DataFlowGraph {
// Output a DOT graph file for debug.
std::string DotString() const;
std::string HumanReadableInfo(bool show_values = true,
bool show_functions = true) const;
private:
// Remove duplicate edges and so on.
void Clean();
......@@ -107,6 +110,32 @@ struct GraphTraits<DataFlowGraph> {
std::unordered_set<Node *> visited_;
};
// Topological sorting iterator on nodes.
struct NodesTSIterator
: public std::iterator<std::forward_iterator_tag, Node *> {
NodesTSIterator() = default;
explicit NodesTSIterator(const std::vector<Node *> &source);
NodesTSIterator(NodesTSIterator &&other)
: sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
other.cursor_ = 0;
}
NodesTSIterator(const NodesTSIterator &other);
Node &operator*();
NodesTSIterator &operator++();
// TODO(Superjomn) the current implementation just compares the first
// element; it needs to compare the graph and all the elements in the queue
// and set.
NodesTSIterator &operator=(const NodesTSIterator &other);
bool operator==(const NodesTSIterator &other);
bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
Node *operator->();
private:
std::vector<Node *> sorted_;
int cursor_{0};
};
explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
// default use BFS to visit the nodes.
......@@ -119,17 +148,24 @@ struct GraphTraits<DataFlowGraph> {
iterator_range<NodesDFSIterator> nodes_in_DFS() {
return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
}
iterator_range<NodesTSIterator> nodes_in_TS() {
return iterator_range<NodesTSIterator>(nodes_ts_begin(), nodes_ts_end());
}
private:
NodesBFSIterator nodes_bfs_begin() {
return NodesBFSIterator(graph_->inputs);
}
NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
NodesDFSIterator nodes_dfs_begin() {
return NodesDFSIterator(graph_->inputs);
}
NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); }
NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }
private:
DataFlowGraph *graph_;
};
......
......@@ -24,11 +24,11 @@ TEST(DataFlowGraph, BFS) {
auto dfg = ProgramDescToDFG(desc);
dfg.Build();
for (auto* in : dfg.inputs) {
for (auto *in : dfg.inputs) {
LOG(INFO) << "inputs: " << in->name() << " "
<< static_cast<int>(in->type());
}
for (auto* out : dfg.outputs) {
for (auto *out : dfg.outputs) {
LOG(INFO) << "outputs: " << out->name() << " "
<< static_cast<int>(out->type());
}
......@@ -57,6 +57,71 @@ TEST(DataFlowGraph, DFS) {
ASSERT_EQ(count, dfg.nodes.size());
}
// Topological sorting.
/*
* Graph topology
* inputs: 0, 1, 2
* 0 -> 4
* 0 -> 5
* 1 -> 6
* 2 -> 7
* 4 -> 5
* 4 -> 7
* 4 -> 3
* 7 -> 3
*/
TEST(DataFlowGraph, TS) {
DataFlowGraph graph;
for (int i = 0; i < 8; i++) {
auto *node = graph.nodes.Create(Node::Type::kValue);
node->SetName("node-" + std::to_string(i));
}
auto add_link = [&](int i, int j) {
Node *source = graph.nodes.GetMutable(i);
Node *target = graph.nodes.GetMutable(j);
target->inlinks.push_back(source);
source->outlinks.push_back(target);
};
graph.inputs.push_back(graph.nodes.GetMutable(0));
graph.inputs.push_back(graph.nodes.GetMutable(1));
graph.inputs.push_back(graph.nodes.GetMutable(2));
add_link(0, 4);
add_link(0, 5);
add_link(1, 6);
add_link(2, 7);
add_link(4, 5);
add_link(4, 7);
add_link(4, 3);
add_link(7, 3);
auto its = GraphTraits<DataFlowGraph>(&graph).nodes_in_TS();
std::vector<int> sorted_ids;
for (auto it = its.begin(); it != its.end(); ++it) {
LOG(INFO) << it->name();
sorted_ids.push_back(it->id());
}
// Assert that a occurs prior to b in sorted_ids.
auto assert_positive_sequence_pair = [&](int a, int b) {
auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a);
auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b);
ASSERT_LT(a_offset, b_offset);
};
assert_positive_sequence_pair(2, 7);
assert_positive_sequence_pair(7, 3);
assert_positive_sequence_pair(4, 3);
assert_positive_sequence_pair(0, 4);
assert_positive_sequence_pair(0, 5);
assert_positive_sequence_pair(1, 6);
assert_positive_sequence_pair(4, 5);
assert_positive_sequence_pair(4, 7);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -43,53 +43,64 @@ function(inference_api_test TARGET_NAME)
endfunction(inference_api_test)
cc_library(paddle_inference_api
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
SRCS api.cc api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
if(NOT APPLE)
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/api.sym")
set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
# Here the shared library doesn't depend on other fluid libraries, otherwise a double free will occur.
cc_library(paddle_inference_api_shared SHARED
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc)
SRCS api.cc api_impl.cc)
add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
if(NOT APPLE)
set(LINK_FLAGS "-fPIC -fvisibility=hidden")
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/api.map")
set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
"execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
" ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n"
"if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
" message(FATAL_ERROR \"Check symbol failed.\")\n"
"endif()\n")
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
DEPENDS paddle_inference_api_shared)
add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
endif()
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
SRCS test_api.cc
DEPS paddle_inference_api)
inference_api_test(test_paddle_inference_api_impl
inference_api_test(test_api_impl
ARGS test_word2vec test_image_classification)
if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS paddle_inference_api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api)
SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api)
inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec)
inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec)
endif()
if (WITH_ANAKIN) # only needed in CI
# Because Anakin does not have official library releases and its protobuf and CUDA versions do not match Paddle's,
# the anakin library will not be merged into our official inference library. To use the anakin prediction API, one needs to
# compile libinference_anakin_api.a and link against anakin.so.
nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
if (WITH_TESTING)
cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
endif()
if(WITH_TESTING)
add_subdirectory(demo)
endif()
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
......
{
global:
*paddle*;
local:
*;
};
......@@ -12,8 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
#include "paddle/fluid/inference/api/api_anakin_engine.h"
#include <cuda.h>
#include <vector>
namespace paddle {
......@@ -47,13 +48,13 @@ bool PaddleInferenceAnakinPredictor::Run(
}
auto d_tensor_in_p = executor_.get_in(input.name);
float *d_data_p = d_tensor_in_p->mutable_data();
if (cudaMemcpy(d_data_p,
static_cast<float *>(input.data.data()),
if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
d_tensor_in_p->valid_size() * sizeof(float),
cudaMemcpyHostToDevice) != 0) {
LOG(ERROR) << "copy data from CPU to GPU error";
return false;
}
cudaStreamSynchronize(NULL);
}
executor_.prediction();
......@@ -69,13 +70,13 @@ bool PaddleInferenceAnakinPredictor::Run(
output.data.Resize(tensor->valid_size() * sizeof(float));
}
// Copy data from GPU -> CPU
if (cudaMemcpy(output.data.data(),
tensor->mutable_data(),
if (cudaMemcpy(output.data.data(), tensor->mutable_data(),
tensor->valid_size() * sizeof(float),
cudaMemcpyDeviceToHost) != 0) {
LOG(ERROR) << "copy data from GPU to CPU error";
return false;
}
cudaStreamSynchronize(NULL);
}
return true;
}
......@@ -104,13 +105,12 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
// A factory to help create different predictors.
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
const AnakinConfig &config) {
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
VLOG(3) << "Anakin Predictor create.";
std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor(config));
return x;
};
}
} // namespace paddle
......@@ -19,7 +19,8 @@ limitations under the License. */
#pragma once
#include "paddle/contrib/inference/paddle_inference_api.h"
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
// from anakin
#include "framework/core/net/net.h"
......@@ -31,7 +32,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
public:
PaddleInferenceAnakinPredictor() {}
PaddleInferenceAnakinPredictor(const AnakinConfig& config);
explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config);
// NOTE Unlike the native engine, the buffers of anakin engine's output_data
// should be allocated first.
......@@ -48,8 +49,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
private:
bool Init(const AnakinConfig& config);
anakin::graph::Graph<anakin::NV,
anakin::saber::AK_FLOAT,
anakin::graph::Graph<anakin::NV, anakin::saber::AK_FLOAT,
anakin::Precision::FP32>
graph_;
anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(model, "", "Directory of the inference model.");
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/api/api_impl.h"
namespace paddle {
namespace {
......@@ -77,8 +77,8 @@ bool NativePaddlePredictor::Init(
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files located in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.model_dir);
inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
......@@ -91,8 +91,8 @@ bool NativePaddlePredictor::Init(
}
ctx_ = executor_->Prepare(*inference_program_, 0);
executor_->CreateVariables(
*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
executor_->CreateVariables(*inference_program_,
sub_scope_ ? sub_scope_ : scope_.get(), 0);
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
......@@ -105,7 +105,7 @@ NativePaddlePredictor::~NativePaddlePredictor() {
PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!");
scope_->DeleteScope(sub_scope_);
}
};
}
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
......@@ -134,10 +134,8 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
// if share variables, we need not create variables
VLOG(4) << "Run prepared context";
executor_->RunPreparedContext(
ctx_.get(),
sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
&feed_targets,
&fetch_targets,
ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
&feed_targets, &fetch_targets,
false /* don't create variable each time */);
VLOG(4) << "Finish prepared context";
if (!GetFetch(fetchs, output_data)) {
......@@ -181,8 +179,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr),
inputs[i].data.data(),
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
feeds->push_back(input);
}
......@@ -232,8 +229,7 @@ bool NativePaddlePredictor::GetFetch(
size_t start = lod[0][j - 1] * common_dim;
size_t end = lod[0][j] * common_dim;
if (end > start) {
std::copy(output_ptr + start,
output_ptr + end,
std::copy(output_ptr + start, output_ptr + end,
data.begin() + (j - 1) * max_dim * common_dim);
}
}
......@@ -257,15 +253,13 @@ bool NativePaddlePredictor::GetFetch(
}
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
const NativeConfig &config) {
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) {
// 1. GPU memory
PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory,
0.f,
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags;
......
......@@ -19,7 +19,7 @@
#include <string>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
......
......@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
......@@ -77,8 +77,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables";
executor_->CreateVariables(
*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
executor_->CreateVariables(*inference_program_,
sub_scope_ ? sub_scope_ : scope_.get(), 0);
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
......@@ -98,8 +98,7 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
if (config.use_gpu) {
// 1. GPU memory
PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory,
0.f,
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags;
......
#!/bin/bash
lib=$1
if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
num_paddle_syms=$(nm -D --defined-only ${lib} | grep paddle | wc -l)
num_google_syms=$(nm -D --defined-only ${lib} | grep google | wc -l)
if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
exit 0
cmake_minimum_required(VERSION 3.0)
project(cpp_inference_demo CXX C)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if(NOT DEFINED PADDLE_LIB)
message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
endif()
if(NOT DEFINED DEMO_NAME)
message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
endif()
option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON)
option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF)
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON)
if(WITH_GPU)
set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
endif()
include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include")
include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
include_directories("${PADDLE_LIB}/third_party/boost")
include_directories("${PADDLE_LIB}/third_party/eigen3")
link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
if(WITH_MKL)
include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so)
set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
if(EXISTS ${MKLDNN_PATH})
include_directories("${MKLDNN_PATH}/include")
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
endif()
else()
set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
endif()
# Note: libpaddle_inference_api.so/a must be put before libpaddle_fluid.so/a
if(WITH_STATIC_LIB)
set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.a
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a)
else()
set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.so
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
endif()
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
glog gflags protobuf snappystream snappy z
${EXTERNAL_LIB})
if(WITH_GPU)
set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so)
endif()
target_link_libraries(${DEMO_NAME} ${DEPS})
# Inference Demos
There are several demos:
- simple_on_word2vec:
- The C++ code is in `simple_on_word2vec.cc`.
- It is suitable for the word2vec model.
- vis_demo:
- The C++ code is in `vis_demo.cc`.
- It is suitable for three models: mobilenet, se_resnext50 and ocr.
- Input data format:
- Each line contains a single record
- Each record's format is
```
<space separated floats as data>\t<space separated ints as shape>
```
To build and execute the demos, simply run
```
./run.sh $PADDLE_ROOT $TURN_ON_MKL $TEST_GPU_CPU
```
- It builds and executes the demos with both static and shared libraries.
- `$PADDLE_ROOT`: paddle library path
- `$TURN_ON_MKL`: use MKL or Openblas
- `$TEST_GPU_CPU`: test both GPU/CPU mode or only CPU mode
- NOTE: for simple_on_word2vec, you must first run `ctest -R test_word2vec` to obtain the word2vec model.
set -x
PADDLE_ROOT=$1
TURN_ON_MKL=$2 # use MKL or Openblas
TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
if [ $2 == ON ]; then
# Export it yourself if you move the install path
MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB}
fi
if [ $3 == ON ]; then
use_gpu_list='true false'
else
use_gpu_list='false'
fi
# download vis_demo data
function download() {
dir_name=$1
mkdir -p $dir_name
cd $dir_name
wget -q ${URL_ROOT}$dir_name.tar.gz
tar xzf *.tar.gz
cd ..
}
URL_ROOT=http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F
mkdir -p data
cd data
vis_demo_list='se_resnext50 ocr mobilenet'
for vis_demo_name in $vis_demo_list; do
download $vis_demo_name
done
cd ..
# compile and test the demo
mkdir -p build
cd build
for WITH_STATIC_LIB in ON OFF; do
# -----simple_on_word2vec-----
rm -rf *
cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=simple_on_word2vec \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB
make -j
word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model'
if [ -d $word2vec_model ]; then
for use_gpu in $use_gpu_list; do
./simple_on_word2vec \
--dirname=$word2vec_model \
--use_gpu=$use_gpu
if [ $? -ne 0 ]; then
echo "simple_on_word2vec demo runs fail."
exit 1
fi
done
fi
# ---------vis_demo---------
rm -rf *
cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=vis_demo \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB
make -j
for use_gpu in $use_gpu_list; do
for vis_demo_name in $vis_demo_list; do
./vis_demo \
--modeldir=../data/$vis_demo_name/model \
--data=../data/$vis_demo_name/data.txt \
--refer=../data/$vis_demo_name/result.txt \
--use_gpu=$use_gpu
if [ $? -ne 0 ]; then
echo "vis demo $vis_demo_name runs fail."
exit 1
fi
done
done
done
set +x
......@@ -16,21 +16,27 @@ limitations under the License. */
* This file contains a simple demo of how to use a model for inference.
*/
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <memory>
#include <thread>
#include "paddle/contrib/inference/paddle_inference_api.h"
#include <thread> //NOLINT
#include "paddle/fluid/inference/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_bool(use_gpu, false, "Whether use gpu.");
namespace paddle {
namespace demo {
DEFINE_string(dirname, "", "Directory of the inference model.");
void Main(bool use_gpu) {
//# 1. Create PaddlePredictor with a config.
NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
if (FLAGS_dirname.empty()) {
LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model";
exit(1);
}
config.model_dir = FLAGS_dirname;
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
......@@ -54,12 +60,16 @@ void Main(bool use_gpu) {
CHECK(predictor->Run(slots, &outputs));
//# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length();
PADDLE_ENFORCE(outputs.size(), 1UL);
// Check the output buffer size and result of each tid.
PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
result[i]);
}
}
}
......@@ -68,7 +78,7 @@ void MainThreads(int num_threads, bool use_gpu) {
// Multi-threads only support on CPU
// 0. Create PaddlePredictor with a config.
NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
config.model_dir = FLAGS_dirname;
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
......@@ -94,14 +104,17 @@ void MainThreads(int num_threads, bool use_gpu) {
CHECK(predictor->Run(inputs, &outputs));
// 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "TID: " << tid << ", "
<< "output buffer size: " << outputs.front().data.length();
PADDLE_ENFORCE(outputs.size(), 1UL);
// Check the output buffer size and result of each tid.
PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
0.000932706};
const size_t num_elements =
outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
result[i]);
}
}
});
......@@ -111,15 +124,18 @@ void MainThreads(int num_threads, bool use_gpu) {
}
}
TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
#ifdef PADDLE_WITH_CUDA
TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
#endif
} // namespace demo
} // namespace paddle
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
paddle::demo::Main(false /* use_gpu*/);
paddle::demo::MainThreads(1, false /* use_gpu*/);
paddle::demo::MainThreads(4, false /* use_gpu*/);
if (FLAGS_use_gpu) {
paddle::demo::Main(true /*use_gpu*/);
paddle::demo::MainThreads(1, true /*use_gpu*/);
paddle::demo::MainThreads(4, true /*use_gpu*/);
}
return 0;
}
......@@ -13,16 +13,15 @@
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/inference/paddle_inference_api.h"
namespace paddle {
namespace demo {
static void split(const std::string& str,
char sep,
static void split(const std::string& str, char sep,
std::vector<std::string>* pieces) {
pieces->clear();
if (str.empty()) {
......
......@@ -18,26 +18,24 @@ limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include "paddle/contrib/inference/demo/utils.h"
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
#include "utils.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
#endif
namespace paddle {
namespace demo {
DEFINE_string(modeldir, "", "Directory of the inference model.");
DEFINE_string(refer, "", "path to reference result for comparison.");
DEFINE_string(
data,
"",
data, "",
"path of data; each line is a record, format is "
"'<space splitted floats as data>\t<space splitted ints as shape'");
DEFINE_bool(use_gpu, false, "Whether use gpu.");
namespace paddle {
namespace demo {
struct Record {
std::vector<float> data;
......@@ -47,7 +45,7 @@ struct Record {
void split(const std::string& str, char sep, std::vector<std::string>* pieces);
Record ProcessALine(const std::string& line) {
LOG(INFO) << "process a line";
VLOG(3) << "process a line";
std::vector<std::string> columns;
split(line, '\t', &columns);
CHECK_EQ(columns.size(), 2UL)
......@@ -65,8 +63,8 @@ Record ProcessALine(const std::string& line) {
for (auto& s : shape_strs) {
record.shape.push_back(std::stoi(s));
}
LOG(INFO) << "data size " << record.data.size();
LOG(INFO) << "data shape size " << record.shape.size();
VLOG(3) << "data size " << record.data.size();
VLOG(3) << "data shape size " << record.shape.size();
return record;
}
......@@ -78,20 +76,22 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
file.close();
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
LOG(INFO) << "predictor output numel " << numel;
LOG(INFO) << "reference output numel " << refer.data.size();
EXPECT_EQ(numel, refer.data.size());
VLOG(3) << "predictor output numel " << numel;
VLOG(3) << "reference output numel " << refer.data.size();
PADDLE_ENFORCE_EQ(numel, refer.data.size());
switch (output.dtype) {
case PaddleDType::INT64: {
for (size_t i = 0; i < numel; ++i) {
EXPECT_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
PADDLE_ENFORCE_EQ(static_cast<int64_t*>(output.data.data())[i],
refer.data[i]);
}
break;
}
case PaddleDType::FLOAT32:
for (size_t i = 0; i < numel; ++i) {
EXPECT_NEAR(
static_cast<float*>(output.data.data())[i], refer.data[i], 1e-5);
PADDLE_ENFORCE_LT(
fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
1e-5);
}
break;
}
......@@ -106,15 +106,15 @@ void Main(bool use_gpu) {
config.prog_file = FLAGS_modeldir + "/__model__";
config.use_gpu = use_gpu;
config.device = 0;
#ifdef PADDLE_WITH_CUDA
config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use;
#endif
if (FLAGS_use_gpu) {
config.fraction_of_gpu_memory = 0.1; // set by yourself
}
LOG(INFO) << "init predictor";
VLOG(3) << "init predictor";
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
LOG(INFO) << "begin to process data";
VLOG(3) << "begin to process data";
// Just a single batch of data.
std::string line;
std::ifstream file(FLAGS_data);
......@@ -129,21 +129,26 @@ void Main(bool use_gpu) {
.data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)),
.dtype = PaddleDType::FLOAT32};
LOG(INFO) << "run executor";
VLOG(3) << "run executor";
std::vector<PaddleTensor> output;
predictor->Run({input}, &output);
LOG(INFO) << "output.size " << output.size();
VLOG(3) << "output.size " << output.size();
auto& tensor = output.front();
LOG(INFO) << "output: " << SummaryTensor(tensor);
VLOG(3) << "output: " << SummaryTensor(tensor);
// compare with reference result
CheckOutput(FLAGS_refer, tensor);
}
} // namespace demo
} // namespace paddle
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
paddle::demo::Main(false /* use_gpu*/);
if (FLAGS_use_gpu) {
paddle::demo::Main(true /*use_gpu*/);
}
return 0;
}
......@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
......
......@@ -15,10 +15,10 @@ limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <thread>  // NOLINT
#include "gflags/gflags.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
......@@ -121,8 +121,8 @@ void MainImageClassification(bool use_gpu) {
// which should be in the range [0.0, 1.0].
feed_target_shapes[0][0] = batch_size;
framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
  SetupTensor<float>(&input, input_dims, static_cast<float>(0),
                     static_cast<float>(1));
std::vector<framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
......
......@@ -15,7 +15,7 @@
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
......@@ -61,4 +61,4 @@ void Main(bool use_gpu) {
TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
} // namespace paddle
......@@ -259,12 +259,15 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor)
op_library(unsqueeze_op DEPS reshape_op)
op_library(squeeze_op DEPS reshape_op)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)
......
......@@ -35,7 +35,14 @@ class AucOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(inference_height, label_height,
"Out and Label should have same height.");
int num_thres = ctx->Attrs().Get<int>("num_thresholds");
ctx->SetOutputDim("AUC", {1});
ctx->SetOutputDim("TPOut", {num_thres});
ctx->SetOutputDim("TNOut", {num_thres});
ctx->SetOutputDim("FPOut", {num_thres});
ctx->SetOutputDim("FNOut", {num_thres});
ctx->ShareLoD("Out", /*->*/ "AUC");
}
......@@ -63,10 +70,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label",
"A 2D int tensor indicating the label of the training data."
"The height is batch size and width is always 1.");
AddInput("TP", "True-Positive value.");
AddInput("FP", "False-Positive value.");
AddInput("TN", "True-Negative value.");
AddInput("FN", "False-Negative value.");
// TODO(typhoonzero): support weight input
AddOutput("AUC",
"A scalar representing the "
"current area-under-the-curve.");
AddOutput("TPOut", "True-Positive value.");
AddOutput("FPOut", "False-Positive value.");
AddOutput("TNOut", "True-Negative value.");
AddOutput("FNOut", "False-Negative value.");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC");
......
......@@ -34,6 +34,12 @@ class AucKernel : public framework::OpKernel<T> {
auto* inference = ctx.Input<Tensor>("Out");
auto* label = ctx.Input<Tensor>("Label");
auto* auc = ctx.Output<Tensor>("AUC");
// Only use output var for now, make sure it's persistable and
// not cleaned up for each batch.
auto* true_positive = ctx.Output<Tensor>("TPOut");
auto* false_positive = ctx.Output<Tensor>("FPOut");
auto* true_negative = ctx.Output<Tensor>("TNOut");
auto* false_negative = ctx.Output<Tensor>("FNOut");
float* auc_data = auc->mutable_data<float>(ctx.GetPlace());
......@@ -54,19 +60,10 @@ class AucKernel : public framework::OpKernel<T> {
const T* inference_data = inference->data<T>();
const int64_t* label_data = label->data<int64_t>();
    auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
    auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
    auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
    auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace());
for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
    // calculate TP, FN, TN, FP for current thresh
......@@ -91,10 +88,10 @@ class AucKernel : public framework::OpKernel<T> {
}
}
// store rates
      tp_data[idx_thresh] += tp;
      fn_data[idx_thresh] += fn;
      tn_data[idx_thresh] += tn;
      fp_data[idx_thresh] += fp;
}
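    // Note: with "+=" the counts accumulate in the persistable
    // TPOut/TNOut/FPOut/FNOut variables across batches, so the AUC computed
    // below is a streaming metric over all data seen so far rather than a
    // per-batch value; e.g. a threshold bucket that saw 3 true positives in
    // batch 1 and 2 in batch 2 holds 5.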
// epsilon to avoid divide by zero.
float epsilon = 1e-6;
......
......@@ -48,7 +48,7 @@ class CheckpointNotifyOp : public framework::OperatorBase {
VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name
<< " and dir:" << dir << " to " << epmap[i];
}
    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
}
};
......
......@@ -149,6 +149,13 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
"(float) "
"Prior boxes center offset.")
.SetDefault(0.5);
AddAttr<bool>(
"min_max_aspect_ratios_order",
"(bool) If set True, the output prior box is in order of"
"[min, max, aspect_ratios], which is consistent with Caffe."
"Please note, this order affects the weights order of convolution layer"
"followed by and does not affect the final detection results.")
.SetDefault(false);
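    // Illustrative ordering, assuming one min_size, one max_size and expanded
    // aspect ratios {1., 2., .5}: with the flag off, the priors of a cell are
    // emitted as [ar=1, ar=2, ar=.5, sqrt(min*max)]; with it on, the
    // Caffe-style [min, sqrt(min*max), ar=2, ar=.5], the ar=1 box being
    // skipped as a duplicate of the min box.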
AddComment(R"DOC(
Prior box operator
Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
......
......@@ -28,8 +28,8 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
const int im_width, const int as_num,
const T offset, const T step_width,
const T step_height, const T* min_sizes,
                            const T* max_sizes, const int min_num, bool is_clip,
                            bool min_max_aspect_ratios_order) {
int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
int box_num = height * width * num_priors;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
......@@ -44,14 +44,28 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
T min_size = min_sizes[m];
if (max_sizes) {
int s = p % (as_num + 1);
      if (!min_max_aspect_ratios_order) {
        if (s < as_num) {
          T ar = aspect_ratios[s];
          bw = min_size * sqrt(ar) / 2.;
          bh = min_size / sqrt(ar) / 2.;
        } else {
          T max_size = max_sizes[m];
          bw = sqrt(min_size * max_size) / 2.;
          bh = bw;
        }
      } else {
if (s == 0) {
bw = bh = min_size / 2.;
} else if (s == 1) {
T max_size = max_sizes[m];
bw = sqrt(min_size * max_size) / 2.;
bh = bw;
} else {
T ar = aspect_ratios[s - 1];
bw = min_size * sqrt(ar) / 2.;
bh = min_size / sqrt(ar) / 2.;
}
}
} else {
int s = p % as_num;
......@@ -94,6 +108,8 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
auto variances = ctx.Attr<std::vector<float>>("variances");
auto flip = ctx.Attr<bool>("flip");
auto clip = ctx.Attr<bool>("clip");
auto min_max_aspect_ratios_order =
ctx.Attr<bool>("min_max_aspect_ratios_order");
std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
......@@ -149,7 +165,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
GenPriorBox<T><<<grid, block, 0, stream>>>(
boxes->data<T>(), r.data<T>(), height, width, im_height, im_width,
aspect_ratios.size(), offset, step_width, step_height, min.data<T>(),
max_data, min_num, clip);
max_data, min_num, clip, min_max_aspect_ratios_order);
framework::Tensor v;
framework::TensorFromVector(variances, ctx.device_context(), &v);
......
......@@ -68,6 +68,8 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
auto variances = ctx.Attr<std::vector<float>>("variances");
auto flip = ctx.Attr<bool>("flip");
auto clip = ctx.Attr<bool>("clip");
auto min_max_aspect_ratios_order =
ctx.Attr<bool>("min_max_aspect_ratios_order");
std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
......@@ -108,26 +110,59 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
int idx = 0;
for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s];
if (min_max_aspect_ratios_order) {
box_width = box_height = min_size / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
if (max_sizes.size() > 0) {
auto max_size = max_sizes[s];
// square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
}
// priors with different aspect ratios
for (size_t r = 0; r < aspect_ratios.size(); ++r) {
float ar = aspect_ratios[r];
if (fabs(ar - 1.) < 1e-6) {
continue;
}
box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
}
} else {
// priors with different aspect ratios
for (size_t r = 0; r < aspect_ratios.size(); ++r) {
float ar = aspect_ratios[r];
box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
}
if (max_sizes.size() > 0) {
auto max_size = max_sizes[s];
// square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
}
}
}
}
......
......@@ -86,8 +86,9 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
std::minstd_rand engine,
std::vector<int>* inds) const {
std::uniform_real_distribution<float> uniform(0, 1);
    const int64_t size = static_cast<int64_t>(inds->size());
    if (size > num) {
      for (int64_t i = num; i < size; ++i) {
int rng_ind = std::floor(uniform(engine) * i);
if (rng_ind < num)
std::iter_swap(inds->begin() + rng_ind + offset,
......
......@@ -281,9 +281,10 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
req_count_++;
}
bool GRPCClient::Wait() {
std::unique_lock<std::mutex> lk(sync_mutex_);
  sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); });
return ok_;
}
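// A false return indicates a transport-level failure (currently a gRPC
// DEADLINE_EXCEEDED, see Proceed below); call sites are expected to check it,
// e.g. PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient").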
void GRPCClient::Proceed() {
......@@ -297,6 +298,14 @@ void GRPCClient::Proceed() {
if (c->status_.ok()) {
VLOG(3) << c->var_h_.String() << " process";
c->Process();
} else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
LOG(ERROR) << c->var_h_.String()
<< " meets grpc error:" << c->status_.error_message();
{
std::lock_guard<std::mutex> lk(sync_mutex_);
ok_ = false;
}
sync_cond_.notify_all();
} else {
LOG(FATAL) << c->var_h_.String()
<< " meets grpc error:" << c->status_.error_message();
......
......@@ -188,7 +188,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {
class GRPCClient : public RPCClient {
public:
  GRPCClient() : ok_(true) {}
virtual ~GRPCClient();
bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
......@@ -221,7 +221,7 @@ class GRPCClient : public RPCClient {
void AsyncSendEndPass(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override;
  bool Wait() override;
void SendBeginPass() override;
......@@ -247,6 +247,7 @@ class GRPCClient : public RPCClient {
std::mutex sync_mutex_;
std::condition_variable sync_cond_;
std::atomic<int64_t> req_count_{0};
bool ok_;
// mutex for GetChannel thread safety
std::mutex chan_mutex_;
......
......@@ -72,7 +72,7 @@ class RPCClient {
virtual void SendBeginPass() = 0;
virtual void SendEndPass() = 0;
  virtual bool Wait() = 0;
template <typename T>
static RPCClient* GetInstance() {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fake_quantize_op.h"
#include <string>
namespace paddle {
namespace operators {
class FakeQuantizeOp : public framework::OperatorWithKernel {
public:
FakeQuantizeOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of FakeQuantizeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FakeQuantizeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("OutMovingScale"),
"OutMovingScale(Out) of FakeQuantizeOp should not be null");
// if (ctx->HasInput("InMovingScale")) {
ctx->SetOutputDim("OutMovingScale", ctx->GetInputDim("InMovingScale"));
//}
// if (ctx->HasInput("InScales")) {
PADDLE_ENFORCE(ctx->HasOutput("OutScales"),
"OutScales(Out) of FakeQuantizeOp should not be null");
ctx->SetOutputDim("OutScales", ctx->GetInputDim("InScales"));
// PADDLE_ENFORCE_EQ(ctx->Inputs("InScales")[0],
// ctx->Outputs("OutScales")[0],
// "Mean and MeanOut should share the same memory");
//}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class FakeQuantizeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input tensor of scale operator.");
AddInput("InScales", "(Tensor) scale buffer, used in static quantization.")
.AsDispensable();
AddInput("InMovingScale", "Last scale, used in static quantization.")
.AsDispensable();
AddInput("InCurrentIter",
"Last iteration number, used in static quantization.")
.AsDispensable();
AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
AddOutput("OutScales",
"(Tensor) scale buffer, used in static quantization.")
.AsDispensable();
AddOutput("OutMovingScale", " Current scale");
AddOutput("OutCurrentIter", "Current iteration number.").AsDispensable();
AddAttr<std::string>("quantize_type",
"(string, default abs_max)"
"The scaling tpe of the quantize operator.")
.SetDefault("abs_max");
AddAttr<int>("window_size", "(int, default 10000)").SetDefault(10000);
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int &bit_length) {
PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
"'bit_length' should be between 1 and 16.");
});
AddAttr<bool>("is_test", "").SetDefault(false);
AddComment(R"DOC(
FakeQuantize operator
quantize_type = abs_max:
$$scale = max(abs(x))$$
quantize_type = range_abs_max:
$$scale = max(max(abs(x)), history_abs_max)$$
quantize_type = moving_average_abs_max:
$$scale = 0.1*scale+0.9*new_abs_max$$
$$Out = scale*X$$
)DOC");
}
};
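// Worked example (illustrative): with bit_length = 8, bin_cnt = 2^7 - 1 = 127.
// For quantize_type = abs_max and an input whose absolute maximum is 0.5,
// scale = 0.5, so an element x = 0.25 is clipped to [-0.5, 0.5] and quantized
// to round(127 / 0.5 * 0.25) = round(63.5) = 64.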
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fake_quantize, ops::FakeQuantizeOp, ops::FakeQuantizeOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
fake_quantize,
ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, float>,
ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
extern __shared__ T shared_max_data[];
if (gridDim.x > 1) {
shared_max_data[tid] = T(0);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T tmp = fabs(in[i]);
if (tmp > shared_max_data[tid]) {
shared_max_data[tid] = tmp;
}
}
} else {
if (bid < n) {
shared_max_data[tid] = fabs(in[bid]);
} else {
shared_max_data[tid] = T(0);
}
}
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && shared_max_data[tid] < shared_max_data[tid + i]) {
shared_max_data[tid] = shared_max_data[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[blockIdx.x] = shared_max_data[0];
}
}
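// The kernel above is a two-phase block reduction: each block strides over
// the input filling shared memory with per-thread maxima, then halves the
// number of active threads each step. FindAbsMaxGpu launches it twice, once
// over the input and once over the gridDimx per-block partial results, to
// reduce to a single value.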
float FindAbsMaxGpu(const platform::CUDADeviceContext& ctx, const float* array,
int length) {
float host_max;
int kNumTheads = 1024;
int gridDimx = (kNumTheads - 1 + length) / kNumTheads;
gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx;
framework::Tensor t;
float* device_max = t.mutable_data<float>(framework::make_ddim({gridDimx}),
platform::CUDAPlace());
FindAbsMaxKernel<float><<<gridDimx, kNumTheads, kNumTheads * sizeof(float),
ctx.stream()>>>(length, array, device_max);
FindAbsMaxKernel<
float><<<1, kNumTheads, kNumTheads * sizeof(float), ctx.stream()>>>(
gridDimx, device_max, device_max);
PADDLE_ENFORCE_EQ(
cudaMemcpy(&host_max, device_max, sizeof(float), cudaMemcpyDeviceToHost),
cudaSuccess, "cudaMemcpy failed");
return host_max;
}
template <typename T>
__global__ void ApplySaturateKernel(const int n, const T* in, T* out,
int* num_saturate, const T min,
const T max) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
extern __shared__ int shared_count[];
shared_count[tid] = 0;
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
if (in[i] > max) {
out[i] = max;
shared_count[tid] += 1;
} else if (in[i] < min) {
out[i] = min;
shared_count[tid] += 1;
} else {
out[i] = in[i];
}
}
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i) {
shared_count[tid] += shared_count[tid + i];
}
__syncthreads();
}
if (tid == 0) {
num_saturate[blockIdx.x] = shared_count[0];
}
}
template <typename T>
__global__ void ReduceKernel(const int n, const T* in, T* out) {
int tid = threadIdx.x;
extern __shared__ T shared_sum[];
if (tid < n) {
shared_sum[tid] = in[tid];
} else {
shared_sum[tid] = T(0);
}
__syncthreads();
  // blockDim.x must be >= n
for (int i = (n + 1) / 2; i > 0; i >>= 1) {
if (tid < i) {
shared_sum[tid] += shared_sum[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[0] = shared_sum[0];
}
}
template <typename T>
int ApplySaturateGpu(const platform::CUDADeviceContext& ctx, const int n,
const T* in, T* out, const T min, const T max) {
int host_num_saturate;
int kNumTheads = 1024;
int gridDimx = (n + kNumTheads - 1) / kNumTheads;
gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx;
framework::Tensor t;
int* device_num_saturate = t.mutable_data<int>(
framework::make_ddim({gridDimx}), platform::CUDAPlace());
ApplySaturateKernel<
T><<<gridDimx, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>(
n, in, out, device_num_saturate, min, max);
ReduceKernel<int><<<1, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>(
gridDimx, device_num_saturate, device_num_saturate);
PADDLE_ENFORCE_EQ(cudaSuccess,
cudaMemcpy(&host_num_saturate, device_num_saturate,
sizeof(int), cudaMemcpyDeviceToHost),
"cudaMemcpy failed");
return host_num_saturate;
}
template <typename DeviceContext, typename T>
class FakeQuantizeCUDAKernel : public framework::OpKernel<T> {
public:
T FindRangeAbsMax(const platform::CUDADeviceContext& ctx,
framework::Tensor* scale_list, framework::Tensor* out_scale,
const T& cur_scale, int window_size,
int current_iter) const {
T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
T remove_tmp = sl[current_iter];
sl[current_iter] = cur_scale;
T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
if (max_scale < cur_scale) {
max_scale = cur_scale;
} else if (fabs(remove_tmp - max_scale) < 1e-6) {
int size = (current_iter > window_size) ? window_size : current_iter;
max_scale = T(FindAbsMaxGpu(ctx, scale_list->data<float>(), size));
}
return max_scale;
}
T FindMovingAverageAbsMmax(framework::Tensor* in_scale,
framework::Tensor* out_scale,
const T& cur_scale) const {
T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
return T(outs[0]);
}
virtual void Compute(const framework::ExecutionContext& context) const {
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
"This kernel only runs on GPU device.");
auto& device_ctx = context.cuda_device_context();
auto* tensor = context.Output<framework::Tensor>("Out");
auto* in = context.Input<framework::Tensor>("X");
const bool is_test = context.Attr<bool>("is_test");
tensor->mutable_data<T>(in->place());
context.Output<framework::Tensor>("OutMovingScale")
->mutable_data<T>(
context.Input<framework::Tensor>("InMovingScale")->place());
auto quantize_type =
static_cast<std::string>(context.Attr<std::string>("quantize_type"));
if (quantize_type == std::string("range_abs_max")) {
context.Output<framework::Tensor>("OutScales")
->mutable_data<T>(
context.Input<framework::Tensor>("InScales")->place());
context.Output<framework::Tensor>("OutCurrentIter")
->mutable_data<T>(
context.Input<framework::Tensor>("InCurrentIter")->place());
}
T scale = T(1);
int window_size = context.Attr<int>("window_size");
T bin_cnt = (T)((1 << (context.Attr<int>("bit_length") - 1)) - 1);
if (quantize_type == std::string("abs_max")) {
auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
auto& device_ctx = context.template device_context<DeviceContext>();
auto* scale_list = context.Output<framework::Tensor>("OutScales");
math::SetConstant<DeviceContext, T> scalar;
scale_list->mutable_data<T>(context.GetPlace());
scalar(device_ctx, scale_list, static_cast<T>(0));
auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
iter->mutable_data<T>(context.GetPlace());
scalar(device_ctx, iter, static_cast<T>(0));
} else if (quantize_type == std::string("range_abs_max")) {
auto* moving_scale = const_cast<framework::Tensor*>(
context.Input<framework::Tensor>("InMovingScale"));
if (is_test) {
scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
} else {
auto* it = const_cast<framework::Tensor*>(
context.Input<framework::Tensor>("InCurrentIter"));
auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
int* last_iter = it->mutable_data<int>(platform::CPUPlace());
int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
auto* scale_list = context.Output<framework::Tensor>("OutScales");
auto* saving_scale =
context.Output<framework::Tensor>("OutMovingScale");
scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
scale = FindRangeAbsMax(device_ctx, scale_list, saving_scale, scale,
window_size, current_iter[0]);
(*current_iter) = (*last_iter) + 1;
}
} else if (quantize_type == std::string("moving_average_abs_max")) {
auto* moving_scale = const_cast<framework::Tensor*>(
context.Input<framework::Tensor>("InMovingScale"));
if (is_test) {
scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
} else {
scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
auto* saving_scale =
context.Output<framework::Tensor>("OutMovingScale");
scale = FindMovingAverageAbsMmax(
const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
}
}
ApplySaturateGpu<T>(device_ctx, in->numel(), in->data<T>(),
tensor->mutable_data<T>(in->place()), -scale, scale);
scale = bin_cnt / scale;
auto& dev =
*context.template device_context<DeviceContext>().eigen_device();
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
eigen_out.device(dev) = (scale * eigen_in).round();
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(fake_quantize,
paddle::operators::FakeQuantizeCUDAKernel<
paddle::platform::CUDADeviceContext, float>,
paddle::operators::FakeQuantizeCUDAKernel<
paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
using platform::Transform;
template <typename DeviceContext, typename T>
class FakeQuantizeKernel : public framework::OpKernel<T> {
public:
T FindAbsMax(framework::Tensor* in, int n) const {
T* p = in->mutable_data<T>(platform::CPUPlace());
T abs_max = (T)0.00000001;
for (int i = 0; i < n; i++) {
T tmp = fabs(p[i]);
if (tmp > abs_max) abs_max = tmp;
}
return T(abs_max);
}
T FindRangeAbsMax(framework::Tensor* scale_list, framework::Tensor* out_scale,
const T& cur_scale, int window_size,
int current_iter) const {
T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
T remove_tmp = sl[current_iter];
sl[current_iter] = cur_scale;
T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
if (max_scale < cur_scale) {
max_scale = cur_scale;
} else if (fabs(remove_tmp - max_scale) < 1e-6) {
int size = (current_iter > window_size) ? window_size : current_iter;
max_scale = T(FindAbsMax(scale_list, size));
}
return max_scale;
}
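  // In effect scale_list is a circular window of the last window_size scales:
  // the slot for current_iter is overwritten with cur_scale, and a full
  // rescan via FindAbsMax is only needed when the evicted entry was itself
  // the running maximum.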
T FindMovingAverageAbsMmax(framework::Tensor* in_scale,
framework::Tensor* out_scale,
const T& cur_scale) const {
T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
return T(outs[0]);
}
virtual void Compute(const framework::ExecutionContext& context) const {
auto* tensor = context.Output<framework::Tensor>("Out");
auto* in = context.Input<framework::Tensor>("X");
const bool is_test = context.Attr<bool>("is_test");
tensor->mutable_data<T>(in->place());
auto* oms_tensor = context.Output<framework::Tensor>("OutMovingScale");
oms_tensor->mutable_data<T>(in->place());
auto quantize_type =
static_cast<std::string>(context.Attr<std::string>("quantize_type"));
if (quantize_type == std::string("range_abs_max")) {
auto* oss_tensor = context.Output<framework::Tensor>("OutScales");
oss_tensor->mutable_data<T>(
context.Input<framework::Tensor>("InScales")->place());
auto* oci_tensor = context.Output<framework::Tensor>("OutCurrentIter");
oci_tensor->mutable_data<T>(
context.Input<framework::Tensor>("InCurrentIter")->place());
}
T scale = static_cast<T>(1);
int window_size = context.Attr<int>("window_size");
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
auto& dev =
*context.template device_context<DeviceContext>().eigen_device();
auto raw_in = framework::EigenVector<T>::Flatten(*in);
if (quantize_type == std::string("abs_max")) {
auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
scale_out.device(dev) = raw_in.abs().maximum();
scale = scale_out(0);
auto& device_ctx = context.template device_context<DeviceContext>();
auto* scale_list = context.Output<framework::Tensor>("OutScales");
math::SetConstant<DeviceContext, T> scalar;
scale_list->mutable_data<T>(context.GetPlace());
scalar(device_ctx, scale_list, static_cast<T>(0));
auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
iter->mutable_data<T>(context.GetPlace());
scalar(device_ctx, iter, static_cast<T>(0));
} else if (quantize_type == std::string("range_abs_max")) {
auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
if (is_test) {
scale = moving_scale->data<T>()[0];
} else {
auto* it = context.Input<framework::Tensor>("InCurrentIter");
auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
const int* last_iter = it->data<int>();
int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
auto* scale_list = context.Output<framework::Tensor>("OutScales");
auto* saving_scale =
context.Output<framework::Tensor>("OutMovingScale");
auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
scale_out.device(dev) = raw_in.abs().maximum();
scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
scale = FindRangeAbsMax(scale_list, saving_scale, scale, window_size,
current_iter[0]);
saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
(*current_iter) = (*last_iter) + 1;
}
} else if (quantize_type == std::string("moving_average_abs_max")) {
auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
if (is_test) {
scale = moving_scale->data<T>()[0];
} else {
auto* saving_scale =
context.Output<framework::Tensor>("OutMovingScale");
auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
scale_out.device(dev) = raw_in.abs().maximum();
scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
scale = FindMovingAverageAbsMmax(
const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
}
}
Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), in->data<T>(),
in->data<T>() + in->numel(), tensor->mutable_data<T>(in->place()),
ClipFunctor<T>(-scale, scale));
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
eigen_out.device(dev) = (bin_cnt / scale * eigen_in).round();
}
};
} // namespace operators
} // namespace paddle
......@@ -45,13 +45,13 @@ class FetchBarrierOp : public framework::OperatorBase {
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
for (auto& ep : eps) {
VLOG(3) << "fetch barrier, ep: " << ep;
rpc_client->AsyncSendFetchBarrier(ep);
}
    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/hierarchical_sigmoid_op.h"
#include <vector>
namespace paddle {
namespace operators {
/**
* Organize the classes into a binary tree. At each node, a sigmoid function
* is used to calculate the probability of belonging to the right branch.
* This idea is from "F. Morin, Y. Bengio (AISTATS 05):
* Hierarchical Probabilistic Neural Network Language Model."
*
* Here we use a simple way of building the binary tree.
* Assuming the number of classes C = 6,
* The classes are organized as a binary tree in the following way:
*
* @code{.py}
* *-*-*- 2
* | | |- 3
* | |
* | |-*- 4
* | |- 5
* |
* |-*- 0
* |- 1
* @endcode
*
* where * indicates an internal node, and each leaf node represents a class.
* - Node 0 ... C-2 are internal nodes.
* - Node C-1 ... 2C-2 are leaf nodes.
* - Class c is represented by leaf node \f$c+C-1\f$.
*
* We assign an id for each node:
* - the id of the root is 0.
* - the left child of a node i is 2*i+1.
* - the right child of a node i is 2*i+2.
*
* It's easy to see that:
* - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$.
* - the j-th level ancestor of node i is
* \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$.
* - A node i is a left child of its parent if \f$(i-1)\%2==0\f$.
*
*/
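// Worked example (illustrative, C = 6 as above): class c = 2 maps to leaf
// node c + C - 1 = 7. Walking up with parent(i) = floor((i - 1) / 2) gives
// the path 7 -> 3 -> 1 -> 0, and since 7, 3 and 1 all satisfy
// (i - 1) % 2 == 0, the binary code of class 2 takes the left branch at each
// of the internal nodes 0, 1 and 3.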
class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("PreOut"),
"Output(PreOut) should not be null.");
const int64_t batch_size = ctx->GetInputDim("X")[0];
std::vector<int64_t> output_shape({batch_size, 1});
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.GetPlace());
}
};
template <typename AttrType>
class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, required) The input tensor with shape [N, D], "
"where N is the size of mini-batch, and D is the feature size.");
AddInput("W",
"(Tensor, required), The parameters of hierarchical "
"sigmoid operator, each of them is a 2-D tensor, the shape is"
"[num_classes - 1, D].");
AddInput("Label",
"(Tensor, required), The labels of training data. It's a"
"tensor with shape [N, 1].");
AddInput("Bias",
"(Tensor, optional), The bias is a tensor with shape"
"[1, num_classes - 1].");
AddOutput("Out",
"(Tensor, required) The output of hierarchical sigmoid operator."
"The shape is [N, 1].");
AddOutput("PreOut",
"(Tensor, required) A intermedia 2-D tensor with shape "
"[batch_size, code_length], where code_length represents the "
"maximum path length from root to leaf nodes.")
.AsIntermediate();
AddAttr<AttrType>("num_classes", "(int, required), The number of classes")
.SetDefault(2);
AddComment(R"DOC(
The hierarchical sigmoid operator organizes the classes into a binary tree.
At each node, a sigmoid function is used to calculate the probability of
belonging to the right branch. This idea is from
"F. Morin, Y. Bengio (AISTATS 05):
Hierarchical Probabilistic Neural Network Language Model."
)DOC");
}
};
class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("PreOut"),
"Input(Preout) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")),
"Output(W@Grad should not be null.)");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")));
if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias"));
}
ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp,
ops::HierarchicalSigmoidOpMaker<int>,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp);
REGISTER_OP_CPU_KERNEL(
hierarchical_sigmoid,
ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext,
double>);
REGISTER_OP_CPU_KERNEL(
hierarchical_sigmoid_grad,
ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
float>,
ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
using platform::Transform;
template <typename DeviceContext, typename T>
class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* w = ctx.Input<framework::Tensor>("W");
auto* label = ctx.Input<framework::Tensor>("Label");
auto* bias = ctx.Input<framework::Tensor>("Bias");
auto* out = ctx.Output<framework::Tensor>("Out");
auto* pre_out = ctx.Output<framework::Tensor>("PreOut");
size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
int64_t code_length = math::FindLastSet(num_classes - 1);
int64_t batch_size = in->dims()[0];
framework::Tensor sum;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto* pre_out_data = pre_out->mutable_data<T>(
framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    // Not all class(leaf) nodes' path lengths equal code_length, so padding
    // with zeros keeps positions beyond a path's end from adding to the loss.
math::SetConstant<DeviceContext, T> zero;
zero(dev_ctx, pre_out, static_cast<T>(0.0));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
math::RowwiseSum<DeviceContext, T> row_sum;
math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
std::vector<int64_t> sum_dims({batch_size, 1UL});
sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
auto sum_mat = EigenMatrix<T>::From(sum);
out->mutable_data<T>(ctx.GetPlace());
auto out_mat = framework::EigenVector<T>::Flatten(*out);
if (bias) {
bit_code.Add(pre_out, *bias);
}
bit_code.Mul(pre_out, *w, *in);
// clip to [-40, 40]
Transform<DeviceContext> trans;
trans(ctx.template device_context<DeviceContext>(), pre_out_data,
pre_out_data + pre_out->numel(), pre_out_data,
ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
bit_code.Sum(*pre_out, out, static_cast<T>(-1));
// use softrelu to calculate cross entropy
pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
row_sum(dev_ctx, *pre_out, &sum);
    // TODO(guosheng): Subtract the out-of-path loss, since not all
    // class(leaf) nodes' path lengths equal code_length. It won't break the
    // gradient check, since both sides contain the out-of-path loss and it
    // cancels out.
out_mat.device(place) = sum_mat + out_mat;
}
};
template <typename DeviceContext, typename T>
class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* w = ctx.Input<framework::Tensor>("W");
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto* w_grad = ctx.Output<framework::Tensor>(framework::GradVarName("W"));
auto* bias_grad =
ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
auto* label = ctx.Input<framework::Tensor>("Label");
auto* pre_out = ctx.Input<framework::Tensor>("PreOut");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
framework::Tensor pre_out_grad;
pre_out_grad.mutable_data<T>(pre_out->dims(), ctx.GetPlace());
in_grad->mutable_data<T>(ctx.GetPlace());
w_grad->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> zero;
zero(dev_ctx, in_grad, static_cast<T>(0.0));
zero(dev_ctx, w_grad, static_cast<T>(0.0));
size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});
// softrelu derivative
pre_out_grad_mat.device(place) =
static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
bit_code.Sub(&pre_out_grad); // the gradient of clip(w * x + b)
pre_out_grad_mat.device(place) =
pre_out_grad_mat * out_grad_mat.broadcast(bcast);
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward.
if (bias_grad) {
bias_grad->mutable_data<T>(ctx.GetPlace());
zero(dev_ctx, bias_grad, static_cast<T>(0.0));
bit_code.AddGrad(pre_out_grad, bias_grad);
}
bit_code.MulGradWeight(pre_out_grad, w_grad, *in);
bit_code.MulGradError(pre_out_grad, *w, in_grad);
}
};
} // namespace operators
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/im2sequence_op.h"
#include <string>
#include <vector>
namespace paddle {
......@@ -28,27 +29,18 @@ class Im2SequenceOp : public framework::OperatorWithKernel {
"Input(X) of Im2SequenceOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of Im2SequenceOp op should not be null.");
auto in_dim = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(in_dim.size(), 4,
"Input(X) format must be 4D tensor, eg., NCHW.");
int img_channels = in_dim[1];
auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
ctx->SetOutputDim("Out",
{in_dim[0], img_channels * kernels[0] * kernels[1]});
}
};
......@@ -61,6 +53,10 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
"C: channels"
"H: height"
"W: width");
AddInput("Y",
"(Tensor) The input tensor of image real size(H, W)."
"2-D with shape [batchsize, 2]")
.AsDispensable();
AddOutput("Out", "(LodTensor) The output data of im2sequence op,");
AddAttr<std::vector<int>>("kernels",
"(vector<int>), the "
......@@ -73,6 +69,13 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
"(vector<int> default:{0, 0, 0, 0}), the "
"paddings(up_pad, left_pad, down_pad, right_pad)")
.SetDefault({0, 0, 0, 0});
AddAttr<std::vector<int>>("out_stride",
"the attribute is valid only when input(Y)"
"is not NULL.this attribute represents the"
"scaling of the pic through the CNN"
"(vector<int> dedault:{1,1}),the out_stride"
" (out_stride_height, out_stride_width)")
.SetDefault({1, 1});
AddComment(R"DOC(
This op uses kernels to scan images and converts these images to sequences.
After expanding, the number of time steps is output_height * output_width
......@@ -123,7 +126,7 @@ output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.]
[ 7. 1. 7. 9. 2. 1. 3. 5.]
[ 5. 7. 2. 4. 1. 3. 9. 0.]
[ 7. 9. 4. 8. 3. 5. 0. 8.]]
output.dims = {8, 8}
output.lod = [[0, 4, 8]]
)DOC");
......
......@@ -13,6 +13,7 @@
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/eigen.h"
......@@ -39,50 +40,107 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* in = ctx.Input<Tensor>("X");
LoDTensor* out = ctx.Output<LoDTensor>("Out");
// TODO(wanghaoshuang): Add layout checker after 'set_layout'
// being available for python API
// PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW,
// "Input(X) layout must be NCHW");
auto in_dim = in->dims();
int batch_size = in_dim[0];
int img_channels = in_dim[1];
int img_height = in_dim[2];
int img_width = in_dim[3];
auto kernels = ctx.Attr<std::vector<int>>("kernels");
auto strides = ctx.Attr<std::vector<int>>("strides");
auto paddings = ctx.Attr<std::vector<int>>("paddings");
if (ctx.HasInput("Y") && batch_size > 1) {
const Tensor* imgrealsize = ctx.Input<Tensor>("Y");
auto out_stride = ctx.Attr<std::vector<int>>("out_stride");
Tensor cpu_shape_tensor;
TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor);
std::vector<int> imgreal_h;
std::vector<int> imgreal_w;
std::vector<int> output_height;
std::vector<int> output_width;
int result = 0;
for (int i = 0; i < batch_size; i++) {
int tmp_real_h = static_cast<int>((cpu_shape_tensor.data<T>())[2 * i]);
int tmp_real_w =
static_cast<int>((cpu_shape_tensor.data<T>())[2 * i + 1]);
if (tmp_real_h % out_stride[0] == 0) {
tmp_real_h = tmp_real_h / out_stride[0];
} else {
tmp_real_h = tmp_real_h / out_stride[0] + 1;
}
if (tmp_real_w % out_stride[1] == 0) {
tmp_real_w = tmp_real_w / out_stride[1];
} else {
tmp_real_w = tmp_real_w / out_stride[1] + 1;
}
imgreal_h.push_back(tmp_real_h);
imgreal_w.push_back(tmp_real_w);
output_height.push_back(Im2SeqOutputSize(
imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0]));
output_width.push_back(Im2SeqOutputSize(
imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1]));
result += output_height[i] * output_width[i];
}
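      // Illustrative: with out_stride = {4, 4} and a real image size of
      // 90 x 65, the branches above compute ceil(90 / 4) x ceil(65 / 4) =
      // 23 x 17, i.e. a ceiling division of the real size by the stride
      // before the per-image output sizes are derived.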
out->mutable_data<T>({result, img_channels * kernels[0] * kernels[1]},
ctx.GetPlace());
const std::vector<int> dilations({1, 1});
int offset_out = 0;
for (int i = 0; i < batch_size; i++) {
const Tensor src =
in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
Tensor dst = out->Slice(offset_out,
offset_out + output_height[i] * output_width[i])
.Resize({output_height[i], output_width[i],
img_channels, kernels[0], kernels[1]});
offset_out += output_height[i] * output_width[i];
math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
f(dev_ctx, src, dilations, strides, paddings, &dst);
}
framework::LoD lod(1);
lod[0].reserve(batch_size + 1);
int offset = 0;
lod[0].push_back(offset);
for (int i = 0; i < batch_size; ++i) {
offset += output_height[i] * output_width[i];
lod[0].push_back(offset);
}
out->set_lod(lod);
} else {
int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
paddings[2], strides[0]);
int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
paddings[3], strides[1]);
out->mutable_data<T>({batch_size * output_height * output_width,
img_channels * kernels[0] * kernels[1]},
ctx.GetPlace());
const std::vector<int> dilations({1, 1});
auto out_dims = out->dims();
out->Resize({batch_size, out->numel() / batch_size});
for (int i = 0; i < batch_size; i++) {
const Tensor src =
in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
Tensor dst =
out->Slice(i, i + 1).Resize({output_height, output_width,
img_channels, kernels[0], kernels[1]});
math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
f(dev_ctx, src, dilations, strides, paddings, &dst);
}
out->Resize(out_dims);
framework::LoD lod(1);
lod[0].reserve(batch_size + 1);
int offset = 0;
lod[0].push_back(offset);
for (int i = 0; i < batch_size; ++i) {
offset += output_height * output_width;
lod[0].push_back(offset);
}
out->set_lod(lod);
}
}
};
......
......@@ -61,6 +61,8 @@ static void ParallelExecuteBlocks(
framework::Async([&executor, &prepared, &program, &scope, idx]() {
int run_block = idx; // thread local
try {
VLOG(3) << "running server block: " << run_block
<< "pointer: " << prepared[run_block].get();
executor->RunPreparedContext(prepared[run_block].get(), scope);
} catch (const std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
......@@ -107,12 +109,14 @@ void ListenAndServOp::RunSyncLoop(
PADDLE_ENFORCE_GE(num_blocks, 2,
"server program should have at least 2 blocks");
// Prepare all the server block
std::vector<int> optimize_blocks_list;
for (size_t i = 1; i < program->Size(); ++i) {
optimize_blocks_list.push_back(i);
}
auto optimize_prepared = executor->Prepare(*program, optimize_blocks_list);
// Insert placeholder for block0 which holds current op itself,
  // NOTE the first block in `optimize_prepared` should never be run.
optimize_prepared.insert(
optimize_prepared.begin(),
std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
......
......@@ -51,6 +51,7 @@ math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
math_library(softmax DEPS math_function)
math_library(matrix_bit_code)
math_library(unpooling)
math_library(vol2col)
......
......@@ -21,6 +21,10 @@
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_WITH_LIBXSMM
#include <libxsmm.h>
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#endif
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <limits>
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
......@@ -30,6 +31,12 @@ struct CBlas<float> {
platform::dynload::cblas_sgemm(args...);
}
#ifdef PADDLE_WITH_LIBXSMM
template <typename... ARGS>
static void SMM_GEMM(ARGS... args) {
libxsmm_sgemm(args...);
}
#endif
template <typename... ARGS>
static void AXPY(ARGS... args) {
platform::dynload::cblas_saxpy(args...);
......@@ -63,6 +70,12 @@ struct CBlas<double> {
platform::dynload::cblas_dgemm(args...);
}
#ifdef PADDLE_WITH_LIBXSMM
template <typename... ARGS>
static void SMM_GEMM(ARGS... args) {
libxsmm_dgemm(args...);
}
#endif
template <typename... ARGS>
static void AXPY(ARGS... args) {
platform::dynload::cblas_daxpy(args...);
......@@ -140,6 +153,9 @@ struct CBlas<double> {
template <>
struct CBlas<platform::float16> {
static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
static void SMM_GEMM(...) {
PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
}
#ifdef PADDLE_WITH_MKLML
static void GEMM_BATCH(...) {
PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
......@@ -147,6 +163,33 @@ struct CBlas<platform::float16> {
#endif
};
template <typename T>
inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa,
bool transb, const T &alpha, const T &beta) {
#ifdef PADDLE_WITH_LIBXSMM
// Refer to https://github.com/hfp/libxsmm/blob/master/README.md
// But the threshold is custom
constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20;
  if (m * n * k > LIBXSMM_THRESHOLD || transa || transb ||
      std::abs(alpha - static_cast<T>(1)) >
          std::numeric_limits<T>::epsilon() ||
      std::abs(beta) > std::numeric_limits<T>::epsilon()) {
return false;
} else {
return true;
}
#endif
return false;
}
template <>
inline bool UseXSMM<platform::float16>(const int &m, const int &n, const int &k,
bool transa, bool transb,
const platform::float16 &alpha,
const platform::float16 &beta) {
return false;
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -156,8 +199,21 @@ void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
#ifdef PADDLE_WITH_LIBXSMM
if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
beta)) {
// Note: SMM uses ColMajor
const char transa = 'N';
const char transb = 'N';
CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda,
&beta, C, &ldc);
} else {
#endif
CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B,
ldb, beta, C, ldc);
#ifdef PADDLE_WITH_LIBXSMM
}
#endif
}
template <>
......
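The dispatch above swaps A with B and M with N because libxsmm's GEMM is column-major while Paddle's data is row-major: computing B^T * A^T in column-major terms yields (A * B)^T, which read row-major is exactly A * B. A self-contained sketch of that trick, with a naive column-major GEMM standing in for libxsmm_sgemm (illustrative only, not the library call):

#include <cstdio>

// Naive column-major C = alpha*A*B + beta*C, no transposes.
void colmajor_gemm(int m, int n, int k, float alpha, const float* A, int lda,
                   const float* B, int ldb, float beta, float* C, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float s = 0.f;
      for (int p = 0; p < k; ++p) s += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = alpha * s + beta * C[i + j * ldc];
    }
}

int main() {
  // Row-major A(2x3), B(3x2); we want row-major C(2x2) = A*B.
  const int M = 2, N = 2, K = 3;
  float A[6] = {1, 2, 3, 4, 5, 6};
  float B[6] = {1, 0, 0, 1, 1, 1};
  float C[4] = {0, 0, 0, 0};
  // Swap operands and M/N: the column-major (N x M) result, read
  // row-major, is exactly A*B.
  colmajor_gemm(N, M, K, 1.f, B, N, A, K, 0.f, C, N);
  printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  // prints: 4 5 / 10 11
  return 0;
}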
......@@ -43,21 +43,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
int col_height = col->dims()[3];
int col_width = col->dims()[4];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
((dilation[0] * (filter_height - 1) + 1))) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
((dilation[1] * (filter_width - 1) + 1))) /
stride[1] +
1,
col_width,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>();
......@@ -178,17 +163,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
int col_height = col->dims()[0];
int col_width = col->dims()[1];
PADDLE_ENFORCE_EQ(
(im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
......
......@@ -77,21 +77,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
int col_height = col->dims()[3];
int col_width = col->dims()[4];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
(dilation[0] * (filter_height - 1) + 1)) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(dilation[1] * (filter_width - 1) + 1)) /
stride[1] +
1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
int num_outputs = im_channels * col_height * col_width;
int blocks = (num_outputs + 1024 - 1) / 1024;
int block_x = 512;
......@@ -274,21 +259,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
int col_height = col->dims()[0];
int col_width = col->dims()[1];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
(dilation[0] * (filter_height - 1) + 1)) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(dilation[1] * (filter_width - 1) + 1)) /
stride[1] +
1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
int block_dim_x = 0;
int block_dim_y = 0;
if (filter_height <= 4 && filter_width <= 4) {
......
......@@ -155,7 +155,7 @@ class RowwiseSum<platform::CPUDeviceContext, T> {
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
PADDLE_ENFORCE_EQ(out->numel(), height);
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
......
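The corrected enforce matches what a row-wise sum actually produces: summing each row of a height x width matrix yields height values, so the output numel must equal height, not the row size. A tiny sketch with illustrative shapes:

#include <cstdio>
#include <vector>

int main() {
  const int height = 3, width = 4;
  std::vector<float> in(height * width, 1.f);  // 3x4 matrix of ones
  std::vector<float> out(height, 0.f);         // numel == height, not width
  for (int i = 0; i < height; ++i)
    for (int j = 0; j < width; ++j) out[i] += in[i * width + j];
  printf("%g %g %g\n", out[0], out[1], out[2]);  // prints: 4 4 4
  return 0;
}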
......@@ -54,8 +54,64 @@ TEST(math_function, gemm_notrans_cblas) {
EXPECT_EQ(input3_ptr[6], 86);
EXPECT_EQ(input3_ptr[7], 99);
}
#ifdef PADDLE_WITH_LIBXSMM
template <typename T>
void MklSmmCompare(int m, int n, int k) {
paddle::framework::Tensor mat_a;
paddle::framework::Tensor mat_b;
paddle::framework::Tensor mat_c_smm;
paddle::framework::Tensor mat_c_mkl;
auto* cpu_place = new paddle::platform::CPUPlace();
T* A = mat_a.mutable_data<T>({m, k}, *cpu_place);
T* B = mat_b.mutable_data<T>({k, n}, *cpu_place);
T* CSMM = mat_c_smm.mutable_data<T>({m, n}, *cpu_place);
T* CMKL = mat_c_mkl.mutable_data<T>({m, n}, *cpu_place);
T alpha = static_cast<T>(1);
T beta = static_cast<T>(0);
for (int i = 0; i < mat_a.numel(); ++i) {
A[i] = static_cast<T>(i);
}
for (int i = 0; i < mat_b.numel(); ++i) {
B[i] = static_cast<T>(i);
}
// lda,ldb,ldc follow RowMajor
int lda = k;
int ldb = n;
int ldc = n;
auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() {
const char transa = 'N';
const char transb = 'N';
paddle::operators::math::CBlas<T>::SMM_GEMM(&transa, &transb, &n, &m, &k,
&alpha, B, &ldb, A, &lda, &beta,
CSMM, &ldc);
};
auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() {
paddle::operators::math::CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans,
CblasNoTrans, m, n, k, alpha, A,
lda, B, ldb, beta, CMKL, ldc);
};
smm();
mkl();
ASSERT_EQ(mat_c_mkl.numel(), mat_c_smm.numel());
for (int i = 0; i < mat_c_mkl.numel(); ++i) {
EXPECT_FLOAT_EQ(CSMM[i], CMKL[i]);
}
}
TEST(math_function, gemm_mkl_vs_smm) {
MklSmmCompare<float>(1, 2, 3);
MklSmmCompare<double>(1, 2, 3);
MklSmmCompare<float>(3, 2, 1);
MklSmmCompare<double>(3, 2, 1);
MklSmmCompare<float>(3, 8, 5);
MklSmmCompare<double>(3, 8, 5);
}
#endif
TEST(math_function, gemm_trans_clbas) {
TEST(math_function, gemm_trans_cblas) {
paddle::framework::Tensor input1;
paddle::framework::Tensor input2;
paddle::framework::Tensor input3;
......
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include <iostream>
namespace paddle {
namespace operators {
namespace math {
template <typename T>
void MatrixBitCodeFunctor<T>::Add(framework::Tensor* tmat,
const framework::Tensor& vec) {
SimpleCodeTable code_table(num_classes_);
size_t batch_size = tmat->dims()[0];
size_t width = tmat->dims()[1];
for (size_t i = 0; i < batch_size; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
tmat->data<T>()[i * width + j] += vec.data<T>()[index];
}
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
framework::Tensor* vec) {
SimpleCodeTable code_table(num_classes_);
size_t batch_size = tmat.dims()[0];
size_t width = tmat.dims()[1];
for (size_t i = 0; i < batch_size; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
vec->data<T>()[index] += tmat.data<T>()[i * width + j];
}
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor& tmat,
framework::Tensor* sum, T scale_sum) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat.dims()[0];
size_t o_width = tmat.dims()[1];
for (size_t i = 0; i < num_samples; ++i) {
T sm = static_cast<T>(0.0);
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
if (code.calc_bit(j)) {
// calc_bit starts from the rightmost bit, while the data in tmat[i] is in
// the reverse order.
sm += tmat.data<T>()[i * o_width + j];
}
}
sum->data<T>()[i] = scale_sum * sm;
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
const framework::Tensor& weight,
const framework::Tensor& input) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat->dims()[0];
size_t tmat_width = tmat->dims()[1];
size_t input_width = input.dims()[1];
size_t weight_width = weight.dims()[1];
auto tmat_value = tmat->data<T>();
auto weight_value = weight.data<T>();
auto input_value = input.data<T>();
for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
T sum = static_cast<T>(0.0);
for (size_t k = 0; k < input_width; ++k) {
sum += weight_value[weight_width * index + k] *
input_value[input_width * i + k];
}
tmat_value[i * tmat_width + j] += sum;
}
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
framework::Tensor* weight,
const framework::Tensor& input) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat.dims()[0];
size_t input_width = input.dims()[1];
size_t tmat_width = tmat.dims()[1];
size_t weight_width = weight->dims()[1];
auto tmat_value = tmat.data<T>();
auto weight_value = weight->data<T>();
auto input_value = input.data<T>();
for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
for (size_t k = 0; k < input_width; ++k) {
weight_value[weight_width * index + k] +=
tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
}
}
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
const framework::Tensor& weight,
framework::Tensor* input) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat.dims()[0];
size_t tmat_width = tmat.dims()[1];
size_t input_width = input->dims()[1];
size_t weight_width = weight.dims()[1];
auto tmat_value = tmat.data<T>();
auto weight_value = weight.data<T>();
auto input_value = input->data<T>();
for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
for (size_t k = 0; k < input_width; ++k) {
input_value[input_width * i + k] +=
tmat_value[i * tmat_width + j] *
weight_value[weight_width * index + k];
}
}
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::Sub(framework::Tensor* tmat) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat->dims()[0];
size_t o_width = tmat->dims()[1];
for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
if (code.calc_bit(j)) {
tmat->data<T>()[i * o_width + j] -= 1;
}
}
}
}
template class MatrixBitCodeFunctor<float>;
template class MatrixBitCodeFunctor<double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
/**
* SimpleCodeTable class should support 3 functions:
*
* size_t size()
* return the number of ids
*
* int get_max_code_length()
* return the maximal code length
*
* SimpleCode operator()(size_t i)
* return the i-th code. Code class is described below.
*
* SimpleCode class should support 3 functions:
*
* int get_length()
* return the length of the code
*
* size_t calc_index(int bit)
* bit ranges from 0 to get_length() - 1
* return the index for the (1+bit) level parent
*
* bool calc_bit(int bit)
* return true if the bit level parent is the right child of (1+bit) level
* parent
*
*/
/**
* return the 1-based index of the highest bit set
*
* for x > 0:
* \f[
* FindLastSet(x) = 1 + \lfloor \log_{2} x \rfloor
* \f]
*/
inline constexpr size_t FindLastSet(size_t x) {
return std::is_same<size_t, unsigned int>::value
? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
: (std::is_same<size_t, unsigned long>::value // NOLINT
? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
: (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
}
struct SimpleCode {
SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {}
/**
* Here the id of the root should be 1 rather than 0, thus the encoding of
* class c is `c + num_classes`, and all siblings share weight indices through
* their common prefixes.
* The weight index is a prefix of the encoding, so the rightmost bit is left
* out in calc_index.
* The binary classification path is a suffix of the encoding, so the leftmost
* bit is left out in calc_bit.
*/
inline size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; }
inline bool calc_bit(int bit) const { return c_ & (1 << bit); }
inline int get_length() const { return FindLastSet(c_) - 1; }
private:
size_t c_;
};
struct SimpleCodeTable {
explicit SimpleCodeTable(size_t num_classes) : num_classes_(num_classes) {}
SimpleCode operator()(size_t code) const {
return SimpleCode(code, num_classes_);
}
size_t size() const { return num_classes_; }
int get_max_code_length() const { return FindLastSet(num_classes_ - 1); }
private:
size_t num_classes_;
};
template <typename T>
class MatrixBitCodeFunctor {
public:
explicit MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids)
: num_classes_(num_classes), ids_(ids) {}
/* For j < code_length
tmat(i, j) += vec(0, index(i, j))
*/
void Add(framework::Tensor* tmat, const framework::Tensor& vec);
/* For j < code_length
vec(0, index(i, j)) += tmat(i, j)
*/
void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec);
/* For j < code_length
sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
*/
void Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum);
/* For j < code_length
tmat(i, j) -= bit(i, j)
*/
void Sub(framework::Tensor* tmat);
/* For j < code_length
input.row(i) += tmat(i, j) * weight.row(index(i, j))
*/
void Mul(framework::Tensor* tmat, const framework::Tensor& weight,
const framework::Tensor& input);
/* For index(i, j) >= 0:
weight.row(index(i, j)) += tmat(i, j) * input.row(i)
*/
void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight,
const framework::Tensor& input);
/* For j < code_length
input.row(i) += tmat(i, j) * weight.row(index(i, j))
*/
void MulGradError(const framework::Tensor& tmat,
const framework::Tensor& weight, framework::Tensor* input);
size_t num_classes_;
const int64_t* ids_;
};
} // namespace math
} // namespace operators
} // namespace paddle
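A worked example of the encoding the header comments describe, re-implemented standalone for illustration (it assumes a 64-bit size_t; the names are not Paddle APIs): with num_classes = 4, class 2 is encoded as c = 2 + 4 = 6 (binary 110), the code length is FindLastSet(6) - 1 = 2, calc_index reads prefixes of the code, and calc_bit reads the path bits.

#include <cassert>
#include <cstdio>

// Standalone re-implementation of the SimpleCode arithmetic, assuming
// 64-bit unsigned values.
int FindLastSetDemo(unsigned long long x) {
  return x ? 64 - __builtin_clzll(x) : 0;
}

int main() {
  const unsigned long long num_classes = 4, cls = 2;
  const unsigned long long c = cls + num_classes;  // 6 == 0b110; root id is 1
  const int length = FindLastSetDemo(c) - 1;       // 2 levels below the root
  assert(length == 2);
  for (int bit = 0; bit < length; ++bit) {
    unsigned long long index = (c >> (bit + 1)) - 1;  // weight row (prefix)
    int right = (c >> bit) & 1;                       // path bit (suffix)
    printf("level %d: index=%llu bit=%d\n", bit, index, right);
  }
  // Prints:
  //   level 0: index=2 bit=0   since (6 >> 1) - 1 == 2 and bit 0 of 6 is 0
  //   level 1: index=0 bit=1   since (6 >> 2) - 1 == 0 and bit 1 of 6 is 1
  return 0;
}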
......@@ -53,7 +53,7 @@ class PrefetchOp : public framework::OperatorBase {
VLOG(3) << "don't send no-initialied variable: " << ins[i];
}
}
rpc_client->Wait();
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
}
};
......
......@@ -81,6 +81,15 @@ class BlockingQueue {
}
}
void ReOpen() {
std::lock_guard<std::mutex> lock(mutex_);
closed_ = false;
std::deque<T> new_deque;
queue_.swap(new_deque);
send_cv_.notify_all();
receive_cv_.notify_all();
}
void Close() {
std::lock_guard<std::mutex> lock(mutex_);
closed_ = true;
......
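ReOpen above resets the closed flag and empties the queue with the swap-with-a-fresh-deque idiom, which leaves the member in a freshly-constructed state and hands the old elements to a temporary for destruction outside the member itself. A minimal sketch of the idiom:

#include <cstdio>
#include <deque>

int main() {
  std::deque<int> queue = {1, 2, 3};
  {
    std::deque<int> new_deque;
    queue.swap(new_deque);  // queue is now empty ...
  }                         // ... and the old elements die with new_deque
  printf("size after reopen: %zu\n", queue.size());  // prints 0
  return 0;
}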
......@@ -23,7 +23,7 @@ class BatchReader : public framework::DecoratedReader {
BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size,
bool discard_leftover)
: DecoratedReader(reader),
batch_size_(batch_size),
batch_size_(static_cast<size_t>(batch_size)),
discard_leftover_(discard_leftover) {
buffer_.reserve(batch_size_);
}
......@@ -31,7 +31,7 @@ class BatchReader : public framework::DecoratedReader {
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
private:
int batch_size_;
size_t batch_size_;
bool discard_leftover_;
std::vector<std::vector<framework::LoDTensor>> buffer_;
};
......@@ -78,7 +78,7 @@ class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
void BatchReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
buffer_.clear();
buffer_.reserve(batch_size_);
for (int i = 0; i < batch_size_; ++i) {
for (size_t i = 0; i < batch_size_; ++i) {
buffer_.push_back(std::vector<framework::LoDTensor>());
reader_->ReadNext(&buffer_.back());
if (buffer_.back().empty()) {
......@@ -95,9 +95,9 @@ void BatchReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
// if buffer_ is empty, the 'out' will return as an empty vector.
return;
}
int out_num = buffer_[0].size();
size_t out_num = buffer_[0].size();
out->reserve(out_num);
for (int j = 0; j < out_num; ++j) {
for (size_t j = 0; j < out_num; ++j) {
// Merge shape and check data type
std::type_index batch_type = buffer_[0][j].type();
framework::DDim batch_shape = buffer_[0][j].dims();
......
......@@ -27,19 +27,17 @@ class PyReader : public framework::FileReader {
queue_ = queue;
}
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
void ReadNext(std::vector<framework::LoDTensor>* out) override {
bool success;
*out = queue_->Pop(&success);
if (!success) out->clear();
}
private:
void ShutdownImpl() override { /* TODO */
}
void Shutdown() override { queue_->Close(); }
void StartImpl() override { /* TODO */
}
void Start() override { queue_->ReOpen(); }
private:
std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
......
......@@ -58,12 +58,15 @@ class LoDTensorBlockingQueue {
inline size_t Size() const { return queue_.Size(); }
inline void Close() { return queue_.Close(); }
inline void ReOpen() { queue_.ReOpen(); }
inline void Close() { queue_.Close(); }
inline bool IsClosed() const { return queue_.IsClosed(); }
private:
void CheckDims(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
void CheckDims(
const std::vector<framework::LoDTensor>& lod_tensor_vec) const {
PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(),
"Expect input size is %d but found %s", dims_.size(),
lod_tensor_vec.size());
......
......@@ -51,7 +51,7 @@ class RecvOp : public framework::OperatorBase {
rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
}
if (sync_mode) {
rpc_client->Wait();
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
}
}
};
......
......@@ -50,13 +50,13 @@ class SendBarrierOp : public framework::OperatorBase {
VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
// need to wait before sending send_barrier message
rpc_client->Wait();
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
if (sync_mode) {
for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
rpc_client->Wait();
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
}
}
};
......
......@@ -59,7 +59,7 @@ class SendOp : public framework::OperatorBase {
}
}
if (sync_send) {
rpc_client->Wait();
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
}
}
};
......
......@@ -88,7 +88,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
input_format = memory::format::nc;
}
for (int i = in_place ? 1 : 0; i < N; i++) {
for (int i = 0; i < N; i++) {
PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
"all inputs must be all LoDTensors");
auto& input = in_vars[i]->Get<LoDTensor>();
......
......@@ -60,6 +60,7 @@ class TopkKernel : public framework::OpKernel<T> {
#endif
for (size_t i = 0; i < row; i++) {
std::vector<std::pair<T, size_t>> vec;
vec.reserve(col);
for (size_t j = 0; j < col; j++) {
vec.push_back(std::pair<T, size_t>(eg_input(i, j), j));
}
......
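The added vec.reserve(col) performs one allocation up front instead of letting push_back regrow the vector roughly log2(col) times per row. A small sketch of the pattern (col is an illustrative value):

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  const std::size_t col = 1000;  // illustrative row width
  std::vector<std::pair<float, std::size_t>> vec;
  vec.reserve(col);  // single allocation; no regrowth during push_back
  for (std::size_t j = 0; j < col; ++j) {
    vec.push_back(std::pair<float, std::size_t>(static_cast<float>(j), j));
  }
  printf("size=%zu capacity=%zu\n", vec.size(), vec.capacity());
  return 0;
}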
......@@ -46,7 +46,7 @@ ENDIF()
# memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
place eigen3 stringpiece cpu_helper framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
cc_test(init_test SRCS init_test.cc DEPS device_context)
......
......@@ -222,15 +222,16 @@ class MKLDNNHandler {
static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT
const std::string& suffix) {
auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) {
dstr += std::to_string(operand_dims[i]) + "-";
}
return dstr;
};
return dims2str(operand_dims) + suffix;
};
protected:
static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) {
dstr += std::to_string(operand_dims[i]) + "-";
}
return dstr;
}
protected:
......
......@@ -145,14 +145,14 @@ void BindBlockDesc(pybind11::module *m) {
.def_property_readonly("id", &pd::BlockDesc::ID)
.def_property_readonly("parent", &pd::BlockDesc::Parent)
.def("get_forward_block_idx", &pd::BlockDesc::ForwardBlockID)
.def("set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID)
.def("_set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID)
.def("append_op", &pd::BlockDesc::AppendOp,
pybind11::return_value_policy::reference)
.def("prepend_op", &pd::BlockDesc::PrependOp,
.def("_prepend_op", &pd::BlockDesc::PrependOp,
pybind11::return_value_policy::reference)
.def("insert_op", &pd::BlockDesc::InsertOp,
.def("_insert_op", &pd::BlockDesc::InsertOp,
pybind11::return_value_policy::reference)
.def("remove_op", &pd::BlockDesc::RemoveOp)
.def("_remove_op", &pd::BlockDesc::RemoveOp)
.def("var",
[](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name;
......@@ -165,7 +165,7 @@ void BindBlockDesc(pybind11::module *m) {
return self.HasVar(name);
},
pybind11::return_value_policy::reference)
.def("rename_var",
.def("_rename_var",
[](pd::BlockDesc &self, const pybind11::bytes &byte_name,
const pybind11::bytes &byte_name_new) {
std::string name = byte_name;
......@@ -189,7 +189,7 @@ void BindBlockDesc(pybind11::module *m) {
return self.FindVarRecursive(name);
},
pybind11::return_value_policy::reference)
.def("remove_var",
.def("_remove_var",
[](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name;
return self.RemoveVar(name);
......
......@@ -66,6 +66,17 @@ paddle_error paddle_arguments_get_value(paddle_arguments args,
return kPD_NO_ERROR;
}
PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args,
uint64_t ID,
paddle_matrix mat) {
if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
auto a = castArg(args);
if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
m->mat = a->args[ID].in;
return kPD_NO_ERROR;
}
paddle_error paddle_arguments_get_ids(paddle_arguments args,
uint64_t ID,
paddle_ivector ids) {
......