diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb796103350ac4403d4151cf08eb4315bcde68fd..b1554fba5e1fa48b5cbdfe2e5b9f317a4f7fefb3 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,8 +63,29 @@ if(WIN32)
         set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
         set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
         set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+        foreach(flag_var
+            CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+            CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+            CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+            CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+            if(${flag_var} MATCHES "/MD")
+                string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+            endif()
+        endforeach(flag_var)
     endif()
-    
+
+    # windows build turn off warnings.
+    foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+        string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
+    endforeach(flag_var)
+    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+        set(${flag_var} "${${flag_var}} /w")
+    endforeach(flag_var)
+
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
     message(STATUS "Using parallel compiling (/MP)")
diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake
index af5dd0e2c9b2d19929f58363d08e7ff40d43b013..351ef1c7c7aebb698a5d41689352a913d0b950e8 100644
--- a/cmake/external/cryptopp.cmake
+++ b/cmake/external/cryptopp.cmake
@@ -22,23 +22,8 @@ SET(CRYPTOPP_TAG        CRYPTOPP_8_2_0)
 
 IF(WIN32)
   SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE)
-  SET(CRYPTOPP_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
-  set(CompilerFlags
-        CMAKE_CXX_FLAGS
-        CMAKE_CXX_FLAGS_DEBUG
-        CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_C_FLAGS
-        CMAKE_C_FLAGS_DEBUG
-        CMAKE_C_FLAGS_RELEASE
-        )
-  foreach(CompilerFlag ${CompilerFlags})
-    string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
-  endforeach()
 ELSE(WIN32)
   SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE)
-  SET(CRYPTOPP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
 ENDIF(WIN32)
 
 set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
@@ -48,7 +33,7 @@ set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
                         -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib
                         -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR}
                         -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        -DCMAKE_CXX_FLAGS=${CRYPTOPP_CMAKE_CXX_FLAGS}
+                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                         -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                         -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                         -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 415e07c75425345f5f1ad29a8544e02a5bfb12e4..ed0bf8396b3faa22350811cf1711f5d1e5b89998 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -90,20 +90,6 @@ macro(safe_set_nvflag flag_name)
     endif()
 endmacro()
 
-macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
-    if (BUILD_SHARED_LIBS) 
-        return() # if build shared libs, the flags keep same with '/MD'
-    endif(BUILD_SHARED_LIBS)
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
-endmacro()
 
 CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
 if(NOT UINT64_MAX_EXISTS)
@@ -229,20 +215,3 @@ endforeach()
 
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
 
-
-if(WIN32)
-    # windows build turn off warnings.
-    if(MSVC_STATIC_CRT)
-        safe_set_static_flag()
-    endif()
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-        string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-    endforeach(flag_var)
-    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
-        set(${flag_var} "${${flag_var}} /w")
-    endforeach(flag_var)
-endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index e3c2409f103d36befed29176b354f77257fea9ec..f19f0eb43d34bd0f3748d7beb1fcf403fa1c9037 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -24,7 +24,7 @@ set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_d
 # so the generation of static lib is temporarily turned off.
 if(WIN32)
     #todo: remove the option 
-    option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   OFF)
+    option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic."   OFF)
     if(NOT PYTHON_EXECUTABLE)
         FIND_PACKAGE(PythonInterp REQUIRED)
     endif()
@@ -165,25 +165,22 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 if(WIN32)
     if(WITH_STATIC_LIB)
-        set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib)
+        set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib
+                             ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.*)
     else()
         set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.dll
-                            ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
+                             ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
     endif()
+    copy(inference_lib_dist
+            SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
+            DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
+            ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
 else(WIN32)
     set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
-endif(WIN32)
-
-if(WIN32 AND NOT WITH_STATIC_LIB)
-        copy(inference_lib_dist
-                SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
-                DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
-                      ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
-else()
-        copy(inference_lib_dist
+    copy(inference_lib_dist
                 SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
                 DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
-endif()
+endif(WIN32)
 
 copy(inference_lib_dist
         SRCS  ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
@@ -211,12 +208,12 @@ add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
 
 set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid")
 set(module "inference")
-if(WIN32 AND NOT WITH_STATIC_LIB)
+if(WIN32)
         copy(fluid_lib_dist
                 SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
                 DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
                 )
-else()
+        else()
         copy(fluid_lib_dist
                 SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
                 DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} 
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index aea972ab3db2af862f5230ea6c1eabeed8b611c5..21080fbe8fd2e14cf7fd805e01948f2f28535c22 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -127,7 +127,8 @@ function(op_library TARGET)
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
 "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
 "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
+"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
+"fused_bn_add_activation_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 9dc96fdfe8622e3e78673664637ab50970fe93c6..cf6fcb7b64365b382c648dd83639e0c44670014d 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -44,10 +44,11 @@ add_subdirectory(api)
 set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
      zero_copy_tensor reset_tensor_array 
         analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
-if(WIN32)
+# TODO(xingzhaolong, jiweibo): remove this and create_static_lib(paddle_fluid) on windows GPU
+if(WIN32 AND WITH_GPU)
   cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_API}) 
 else()
- create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) 
+  create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) 
 endif()
 
 if(NOT APPLE AND NOT WIN32)
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index ac914700643af2e7e8eca5dcf0bdf8de88e320d6..42e62011f84c18b875a3fa48b95a05f152fb5791 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1048,6 +1048,7 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
     const AnalysisConfig &config) {
+  LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
   return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
       config);
 }
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index ca0a5148f0622a8c848cb18afb94f600a547bbfe..c78cdf24dec561f5fd5643cb50ee243a58b3ab6a 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -373,6 +373,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>(
     const NativeConfig &config) {
+  LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
   return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
 }
 
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 08a1a5428193c2d506f511112e4a26d73c382ff1..6a3760e1f749b2b4875df00b01def57c979b3c93 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -51,8 +51,8 @@ if (WIN32)
     set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
     set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
     set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    safe_set_static_flag()
     if (WITH_STATIC_LIB)
-      safe_set_static_flag()
       add_definitions(-DSTATIC_LIB)
     endif()
   endif()
@@ -136,7 +136,7 @@ else()
   set(DEPS ${DEPS}
       ${MATH_LIB} ${MKLDNN_LIB}
       glog gflags_static libprotobuf  xxhash ${EXTERNAL_LIB})
-  set(DEPS ${DEPS} libcmt shlwapi.lib)
+  set(DEPS ${DEPS} shlwapi.lib)
 endif(NOT WIN32)
 
 if(WITH_GPU)
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 6b7fb0f619a67cc01dac2b09525bb2bfa05207ba..a3e7bec398af7e193a75395ad40175336f5f7503 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -6,7 +6,7 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
 DATA_DIR=$4 # dataset
 TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include
 TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
-
+MSVC_STATIC_CRT=$7
 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
 
 cd `dirname $0`
@@ -66,43 +66,54 @@ mkdir -p build
 cd build
 rm -rf *
 
-if [ $(echo `uname` | grep "Win") != "" ]; then
-  # -----simple_on_word2vec on windows-----
-  cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-    -DWITH_MKL=$TURN_ON_MKL \
-    -DDEMO_NAME=simple_on_word2vec \
-    -DWITH_GPU=$TEST_GPU_CPU \
-    -DWITH_STATIC_LIB=OFF
-  msbuild  /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
-  Release/simple_on_word2vec.exe \
-      --dirname=$DATA_DIR/word2vec/word2vec.inference.model \
-      --use_gpu=False
-  if [ $? -ne 0 ]; then
-    echo "simple_on_word2vec demo runs fail."
-    exit 1
-  fi
-
-  # -----vis_demo on windows-----
-  rm -rf *
-  cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-    -DWITH_MKL=$TURN_ON_MKL \
-    -DDEMO_NAME=vis_demo \
-    -DWITH_GPU=$TEST_GPU_CPU \
-    -DWITH_STATIC_LIB=OFF
-  msbuild  /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
-  for vis_demo_name in $vis_demo_list; do
-    Release/vis_demo.exe \
-      --modeldir=$DATA_DIR/$vis_demo_name/model \
-      --data=$DATA_DIR/$vis_demo_name/data.txt \
-      --refer=$DATA_DIR/$vis_demo_name/result.txt \
-      --use_gpu=False
-    if [ $? -ne 0 ]; then
-      echo "vis demo $vis_demo_name runs fail."
-      exit 1
+for WITH_STATIC_LIB in ON OFF; do
+  if [ $(echo `uname` | grep "Win") != "" ]; then
+    # TODO(xingzhaolong, jiweibo): remove this if windows GPU library is ready.
+    if [ $TEST_GPU_CPU == ON] && [ $WITH_STATIC_LIB ==ON ]; then
+      return 0
     fi
-  done
-else
-  for WITH_STATIC_LIB in ON OFF; do
+    
+    # -----simple_on_word2vec on windows-----
+    cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
+      -DWITH_MKL=$TURN_ON_MKL \
+      -DDEMO_NAME=simple_on_word2vec \
+      -DWITH_GPU=$TEST_GPU_CPU \
+      -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+      -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
+    msbuild  /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
+    for use_gpu in $use_gpu_list; do
+      Release/simple_on_word2vec.exe \
+        --dirname=$DATA_DIR/word2vec/word2vec.inference.model \
+        --use_gpu=$use_gpu
+      if [ $? -ne 0 ]; then
+        echo "simple_on_word2vec demo runs fail."
+        exit 1
+      fi
+    done
+
+    # -----vis_demo on windows-----
+    rm -rf *
+    cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
+      -DWITH_MKL=$TURN_ON_MKL \
+      -DDEMO_NAME=vis_demo \
+      -DWITH_GPU=$TEST_GPU_CPU \
+      -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+      -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
+    msbuild  /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
+    for use_gpu in $use_gpu_list; do
+      for vis_demo_name in $vis_demo_list; do
+        Release/vis_demo.exe \
+          --modeldir=$DATA_DIR/$vis_demo_name/model \
+          --data=$DATA_DIR/$vis_demo_name/data.txt \
+          --refer=$DATA_DIR/$vis_demo_name/result.txt \
+          --use_gpu=$use_gpu
+        if [ $? -ne 0 ]; then
+          echo "vis demo $vis_demo_name runs fail."
+          exit 1
+        fi
+      done
+    done
+  else
     # -----simple_on_word2vec on linux/mac-----
     rm -rf *
     cmake .. -DPADDLE_LIB=${inference_install_dir} \
@@ -123,7 +134,6 @@ else
         fi
       done
     fi
-
     # ---------vis_demo on linux/mac---------
     rm -rf *
     cmake .. -DPADDLE_LIB=${inference_install_dir} \
@@ -145,7 +155,6 @@ else
         fi
       done
     done
-
     # --------tensorrt mobilenet on linux/mac------
     if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
       rm -rf *
@@ -167,6 +176,6 @@ else
         exit 1
       fi
     fi
-  done
-fi
+  fi
+done
 set +x
diff --git a/paddle/fluid/inference/api/paddle_infer_declare.h b/paddle/fluid/inference/api/paddle_infer_declare.h
index 39c9653f16cefb71a9f2a0ddcc08723d189d411c..e8525f440fe7f2d54d045eedb79aed228513e550 100644
--- a/paddle/fluid/inference/api/paddle_infer_declare.h
+++ b/paddle/fluid/inference/api/paddle_infer_declare.h
@@ -17,11 +17,7 @@
 #if defined(_WIN32)
 #ifndef PD_INFER_DECL
 #ifdef PADDLE_DLL_INFERENCE
-#ifndef PADDLE_ON_INFERENCE
-#define PD_INFER_DECL
-#else
 #define PD_INFER_DECL __declspec(dllexport)
-#endif  // PADDLE_ON_INFERENCE
 #else
 #define PD_INFER_DECL __declspec(dllimport)
 #endif  // PADDLE_DLL_INFERENCE
diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc
index 31915496893e6242dc7cd10ffd48af278d124245..c1bf4c974fac8c80c3e8e31fbd247332a325e2aa 100644
--- a/paddle/fluid/inference/capi/pd_predictor.cc
+++ b/paddle/fluid/inference/capi/pd_predictor.cc
@@ -131,7 +131,9 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
   PADDLE_ENFORCE_EQ(
       input_names.size(), in_size,
       paddle::platform::errors::InvalidArgument(
-          "The number of input and the number of model's input must match."));
+          "The number of input and the number of model's input must match. The "
+          "number of input is %d, the number of model's input is %d.",
+          input_names.size(), in_size));
   for (int i = 0; i < in_size; ++i) {
     auto input_t = predictor->GetInputTensor(inputs[i].name);
     std::vector<int> tensor_shape;
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index c497ab384b5fac74b5241d61517485fd8f2b40c4..84e011c6505a8fe974effbecf54101e0e51d29fa 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -47,7 +47,9 @@ void Init(const std::vector<std::string> argv) {
 
 void ReadBinaryFile(const std::string& filename, std::string* contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  PADDLE_ENFORCE_EQ(
+      fin.is_open(), true,
+      platform::errors::Unavailable("Failed to open file %s.", filename));
   fin.seekg(0, std::ios::end);
   contents->clear();
   contents->resize(fin.tellg());
@@ -133,9 +135,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
-  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
-                 "model version %ld is not supported.",
-                 main_program->Version());
+  PADDLE_ENFORCE_EQ(
+      framework::IsProgramVersionSupported(main_program->Version()), true,
+      platform::errors::Unavailable("Model version %ld is not supported.",
+                                    main_program->Version()));
 
   // model_from_memory is false in separate parameters.
   LoadPersistables(executor, scope, *main_program, dirname, "",
@@ -151,9 +154,10 @@ std::unique_ptr<framework::ProgramDesc> Load(
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
-  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
-                 "model version %ld is not supported.",
-                 main_program->Version());
+  PADDLE_ENFORCE_EQ(
+      framework::IsProgramVersionSupported(main_program->Version()), true,
+      platform::errors::Unavailable("Model version %ld is not supported.",
+                                    main_program->Version()));
 
   LoadPersistables(executor, scope, *main_program, "", param_filename,
                    false /* model_from_memory */);
@@ -165,9 +169,10 @@ std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
     const std::string& prog_buffer, const std::string& param_buffer) {
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(prog_buffer));
-  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
-                 "model version %ld is not supported.",
-                 main_program->Version());
+  PADDLE_ENFORCE_EQ(
+      framework::IsProgramVersionSupported(main_program->Version()), true,
+      platform::errors::Unavailable("Model version %ld is not supported.",
+                                    main_program->Version()));
 
   LoadPersistables(executor, scope, *main_program, "", param_buffer,
                    true /* model_filename */);
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
index 76b0832c546b92068364ba6b2eda65a04742e5f0..0bf8a1691e2192b278fcd209162135027ed24e71 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
@@ -27,8 +27,8 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
 
   PADDLE_ENFORCE_EQ(
       Has(plugin_type), true,
-      platform::errors::NotFound(
-          "trt plugin type %s does not exists, check it.", plugin_type));
+      platform::errors::NotFound("TensorRT plugin type `%s` does not exists.",
+                                 plugin_type));
   auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
   owned_plugins_.emplace_back(plugin);
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
index 6fcb70c6d3299f830e1e95e328b2645aedf9cc31..16751c764bd03af9bbb7cbd77dd9287c17150dd5 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
@@ -103,12 +103,11 @@ struct Serializer<std::vector<T>,
     DeserializeValue(buffer, buffer_size, &size);
     value->resize(size);
     size_t nbyte = value->size() * sizeof(T);
-    PADDLE_ENFORCE_GE(
-        *buffer_size, nbyte,
-        platform::errors::InvalidArgument("Expect buffer size >= value size in "
-                                          "trt plugin deserialization, but got "
-                                          "buffer size = %d, value size = %d.",
-                                          *buffer_size, nbyte));
+    PADDLE_ENFORCE_GE(*buffer_size, nbyte,
+                      platform::errors::InvalidArgument(
+                          "Insufficient data in buffer, expect contains %d "
+                          "byte, but actually only contains %d byte.",
+                          *buffer_size, nbyte));
     std::memcpy(value->data(), *buffer, nbyte);
     reinterpret_cast<char const*&>(*buffer) += nbyte;
     *buffer_size -= nbyte;
diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h
index 990bef359499834c3a7cb025c3fb1d94ceea958e..6828924c300fdfec6640e7b19a2c06b0826aa455 100644
--- a/paddle/fluid/inference/utils/singleton.h
+++ b/paddle/fluid/inference/utils/singleton.h
@@ -46,7 +46,9 @@ struct Registry {
 
   template <typename ItemChild>
   void Register(const std::string& name) {
-    PADDLE_ENFORCE_EQ(items_.count(name), 0);
+    PADDLE_ENFORCE_EQ(items_.count(name), 0,
+                      platform::errors::AlreadyExists(
+                          "Item `%s` has beed registered.", name));
     items_[name] = new ItemChild;
   }
 
diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc
index 629fedba6e3db474869ebddc02470c2ff007e658..e5fcd270eb8b8fa58175e11e955161ebfbb2846c 100644
--- a/paddle/fluid/operators/add_position_encoding_op.cc
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
@@ -69,12 +69,18 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("alpha", "The scale of Original Embedding.")
         .SetDefault(1.0f)
         .AddCustomChecker([](const float& alpha) {
-          PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0.");
+          PADDLE_ENFORCE_GE(
+              alpha, 0.0f,
+              platform::errors::InvalidArgument(
+                  "Attribute 'alpha' must be greater than or equal to 0.0."));
         });
     AddAttr<float>("beta", "The scale of Position Embedding.")
         .SetDefault(1.0f)
         .AddCustomChecker([](const float& beta) {
-          PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0.");
+          PADDLE_ENFORCE_GE(
+              beta, 0.0f,
+              platform::errors::InvalidArgument(
+                  "Attribute 'beta' must be greater than or equal to 0.0."));
         });
     AddComment(R"DOC(
     Add Position Encoding Operator.
diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h
index b462c43d23a534c3520a2a852252fe0333222d77..1418d96b67b75ea3a2d4b3d95d3e4bdfb17618ee 100644
--- a/paddle/fluid/operators/assign_value_op.h
+++ b/paddle/fluid/operators/assign_value_op.h
@@ -76,7 +76,10 @@ class AssignValueKernel : public framework::OpKernel<T> {
         value_name = "int64_values";
         break;
       default:
-        PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype);
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported data type(code %d) for AssignValue operator, only "
+            "supports bool, int32, float32 and int64.",
+            dtype));
         break;
     }
     CopyVecotorToTensor<T>(value_name, out, ctx);
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
index 5b7bcde21a99f23b653cc8b822aa3e22539e9d82..d67d90c348e6f1db9fff604b3eff7b6a79141d07 100644
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -33,29 +33,37 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
     auto out_vars = context.MultiOutputVar("Output");
 
     PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0),
-                      "The CoalesceTensorOp has no input.");
-    PADDLE_ENFORCE_EQ(
-        in_var_names.size(), out_var_names.size(),
-        "The number of CoalesceTensorOp's input and output is not match.");
+                      platform::errors::InvalidArgument(
+                          "The CoalesceTensor operator has no input."));
+    PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size(),
+                      platform::errors::InvalidArgument(
+                          "The number of CoalesceTensor operator's input and "
+                          "output is not match, "
+                          "input number is %u, output number is %u.",
+                          in_var_names.size(), out_var_names.size()));
 
     // Input & Output check: only support LoDTensor
     for (size_t i = 0; i < in_var_names.size(); ++i) {
       PADDLE_ENFORCE_NOT_NULL(
           in_vars[i],
-          "The input variable %s of CoalesceTensorOp does not exist.",
-          in_var_names[i]);
+          platform::errors::NotFound("The input variable %s of CoalesceTensor "
+                                     "operator does not exist.",
+                                     in_var_names[i]));
       PADDLE_ENFORCE_NOT_NULL(
           out_vars[i],
-          "The output variable %s of CoalesceTensorOp does not exist.",
-          out_var_names[i]);
-      PADDLE_ENFORCE_EQ(
-          in_vars[i]->IsType<framework::LoDTensor>(), true,
-          "The input variable %s of CoalesceTensorOp is not LoDTensor.",
-          in_var_names[i]);
-      PADDLE_ENFORCE_EQ(
-          out_vars[i]->IsType<framework::LoDTensor>(), true,
-          "The output variable %s of CoalesceTensorOp is not LoDTensor.",
-          in_var_names[i]);
+          platform::errors::NotFound("The output variable %s of CoalesceTensor "
+                                     "operator does not exist.",
+                                     out_var_names[i]));
+      PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensor>(), true,
+                        platform::errors::InvalidArgument(
+                            "The input variable %s of CoalesceTensor operator "
+                            "is not LoDTensor.",
+                            in_var_names[i]));
+      PADDLE_ENFORCE_EQ(out_vars[i]->IsType<framework::LoDTensor>(), true,
+                        platform::errors::InvalidArgument(
+                            "The output variable %s of CoalesceTensor operator "
+                            "is not LoDTensor.",
+                            in_var_names[i]));
     }
 
     auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
@@ -64,7 +72,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       for (size_t i = 0; i < in_var_names.size(); ++i) {
         PADDLE_ENFORCE_EQ(
             in_var_names[i], out_var_names[i],
-            "The input and output variable of CoalesceTensorOp is different.");
+            platform::errors::InvalidArgument(
+                "The input and output variable of CoalesceTensor operator is "
+                "different, %dth input is %s, %dth output is %s.",
+                i, in_var_names[i], i, out_var_names[i]));
       }
     } else {
       // Init the output as input
@@ -134,16 +145,25 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       const std::vector<const framework::LoDTensor *> &lod_tensors,
       const std::vector<std::string> var_names, size_t *numel,
       const size_t &size_of_dtype, const platform::Place &place) const {
-    PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size());
+    PADDLE_ENFORCE_EQ(
+        lod_tensors.size(), var_names.size(),
+        platform::errors::InvalidArgument(
+            "The number of input tensor and variable does not match, the "
+            "number of input tensor is %u, the number of input variable is %u.",
+            lod_tensors.size(), var_names.size()));
     *numel = 0;
     std::stringstream ss;
     ss << "alloc_space_for_vars: ";
     for (size_t i = 0; i < var_names.size(); ++i) {
       PADDLE_ENFORCE_EQ(lod_tensors[i]->IsInitialized(), true,
-                        "%s is not initialized.", var_names[i]);
+                        platform::errors::InvalidArgument(
+                            "Tensor `%s` is not initialized.", var_names[i]));
 
       auto size = lod_tensors[i]->numel();
-      PADDLE_ENFORCE_GT(size, 0);
+      PADDLE_ENFORCE_GT(
+          size, 0,
+          platform::errors::InvalidArgument(
+              "The number of tensor `%s`'s elements is 0.", var_names[i]));
       ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
          << ") "
          << " addres:" << lod_tensors[i]->data<void>() << ", ";
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 4f337c03599a548ac3d95ddd06c726be30d7c13f..7937e432d22faa3ffd93e46a39b7b1cc5500dbf8 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/concat_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
@@ -78,7 +79,8 @@ class ConcatOp : public framework::OperatorWithKernel {
       }
     }
     if (flag == 0) {
-      PADDLE_THROW("All Inputs of Concat OP are Empty!");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "All Inputs of Concat OP are Empty!"));
     }
 #ifdef PADDLE_WITH_MKLDNN
     if (platform::CanMKLDNNBeUsed(ctx)) {
diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
index 25b45f281a799ade12ec9cbfb8fb262dbc572196..fac8e24251033c301c911f35dcfd0ddb82b713ce 100644
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -162,7 +162,20 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
       workspace_size = GetWorkspaceSize(args, algo);
 
       if (workspace_size > workspace_size_limit) {
+#if CUDNN_VERSION >= 8000
         workspace_size_limit = workspace_size;
+#else
+        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
+                   "the workspace size request("
+                << workspace_size << ") exceeds the limit("
+                << workspace_size_limit << ")";
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+                args.handle, args.idesc.desc(), args.wdesc.desc(),
+                args.cdesc.desc(), args.odesc.desc(),
+                CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &algo));
+#endif
       }
 #else
       PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -291,8 +304,23 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
 #endif
       workspace_size = GetWorkspaceSize(args, algo);
       if (workspace_size > workspace_size_limit) {
-        workspace_size_limit = workspace_size;
         has_got_workspace_size = false;
+#if CUDNN_VERSION >= 8000
+        // There is no cudnnGetConvolutionBackwardDataAlgorithm in CUDNN 8
+        // version.
+        workspace_size_limit = workspace_size;
+#else
+        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
+                   "the workspace size request("
+                << workspace_size << ") exceeds the limit("
+                << workspace_size_limit << ")";
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+                args.handle, args.wdesc.desc(), args.odesc.desc(),
+                args.cdesc.desc(), args.idesc.desc(),
+                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &algo));
+#endif
       }
 #else
       PADDLE_ENFORCE_CUDA_SUCCESS(
diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc
index 48743f2e48c8a7686497adff52f23f31346aeda7..0d4d68d9f622fef9df4819d6092411a4d7db65f7 100644
--- a/paddle/fluid/operators/dequantize_abs_max_op.cc
+++ b/paddle/fluid/operators/dequantize_abs_max_op.cc
@@ -45,10 +45,8 @@ class DequantizeMaxAbsOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of DequantizeMaxAbsOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of DequantizeMaxAbsOp should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DequantizeMaxAbs");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DequantizeMaxAbs");
 
     ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc
index b46d231d0ff7774c64745b3b77953cf2ed8d82f7..6b1b0cd8b3578a344978afae642b66759589ffde 100644
--- a/paddle/fluid/operators/detection/gpc.cc
+++ b/paddle/fluid/operators/detection/gpc.cc
@@ -532,7 +532,8 @@ static int count_contours(polygon_node *polygon) {
 }
 
 static void add_left(polygon_node *p, double x, double y) {
-  PADDLE_ENFORCE_NOT_NULL(p);
+  PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument(
+                                 "Input polygon node is nullptr."));
   vertex_node *nv = NULL;
 
   /* Create a new vertex node and set its fields */
@@ -588,7 +589,8 @@ static void add_right(polygon_node *p, double x, double y) {
 }
 
 static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) {
-  PADDLE_ENFORCE_NOT_NULL(p);
+  PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument(
+                                 "Input polygon node is nullptr."));
   polygon_node *target = NULL;
 
   /* Label contour as external */
@@ -664,7 +666,8 @@ void add_vertex(vertex_node **t, double x, double y) {
 }
 
 void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) {
-  PADDLE_ENFORCE_NOT_NULL(e);
+  PADDLE_ENFORCE_NOT_NULL(e, paddle::platform::errors::InvalidArgument(
+                                 "Input edge node is nullptr."));
   add_vertex(&(e->outp[p]->v[s]), x, y);
   e->outp[p]->active++;
 }
@@ -693,7 +696,8 @@ static bbox *create_contour_bboxes(gpc_polygon *p) {
 
   gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
                    const_cast<char *>("Bounding box creation"));
-  PADDLE_ENFORCE_NOT_NULL(box);
+  PADDLE_ENFORCE_NOT_NULL(box, paddle::platform::errors::ResourceExhausted(
+                                   "Failed to malloc box memory."));
 
   /* Construct contour bounding boxes */
   for (c = 0; c < p->num_contours; c++) {
@@ -857,7 +861,9 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
   /* Create an extended hole array */
   gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
                   const_cast<char *>("contour hole addition"));
-  PADDLE_ENFORCE_NOT_NULL(extended_hole);
+  PADDLE_ENFORCE_NOT_NULL(extended_hole,
+                          paddle::platform::errors::ResourceExhausted(
+                              "Failed to malloc extended hole memory."));
 
   /* Create an extended contour array */
   gpc_malloc<gpc_vertex_list>(extended_contour,
@@ -975,7 +981,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
   /* Build scanbeam table from scanbeam tree */
   gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
                      const_cast<char *>("sbt creation"));
-  PADDLE_ENFORCE_NOT_NULL(sbt);
+  PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted(
+                                   "Failed to malloc scanbeam table memory."));
+
   build_sbt(&scanbeam, sbt, sbtree);
   scanbeam = 0;
   free_sbtree(&sbtree);
@@ -1017,7 +1025,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
     e0 = aet;
     e1 = aet;
     /* Set up bundle fields of first edge */
-    PADDLE_ENFORCE_NOT_NULL(aet);
+    PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument(
+                                     "Edge node AET is nullptr."));
+
     aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
     aet->bundle[ABOVE][!aet->type] = 0;
     aet->bstate[ABOVE] = UNBUNDLED;
@@ -1612,7 +1622,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
   /* Build scanbeam table from scanbeam tree */
   gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
                      const_cast<char *>("sbt creation"));
-  PADDLE_ENFORCE_NOT_NULL(sbt);
+  PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted(
+                                   "Failed to malloc scanbeam table memory."));
   build_sbt(&scanbeam, sbt, sbtree);
   scanbeam = 0;
   free_sbtree(&sbtree);
@@ -1650,7 +1661,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
     e1 = aet;
 
     /* Set up bundle fields of first edge */
-    PADDLE_ENFORCE_NOT_NULL(aet);
+    PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument(
+                                     "Edge node AET is nullptr."));
     aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
     aet->bundle[ABOVE][!aet->type] = 0;
     aet->bstate[ABOVE] = UNBUNDLED;
diff --git a/paddle/fluid/operators/diag_v2_op.cu b/paddle/fluid/operators/diag_v2_op.cu
index 4386cc6b8183c03b4d4a19aba7d1126eac2ab495..12ea31945f8d032e1f395c2fb92d9ef31d10c7e8 100644
--- a/paddle/fluid/operators/diag_v2_op.cu
+++ b/paddle/fluid/operators/diag_v2_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+#include <tuple>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/diag_v2_op.h"
 
@@ -58,6 +59,17 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> {
     auto out_dims = out->dims();
     auto& dev_ctx = context.template device_context<DeviceContext>();
 
+    auto GetBlockGridSize = [&dev_ctx](int64_t size) {
+      const int64_t block_size =
+          std::min(size, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock()));
+      int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+      const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
+                                          static_cast<int64_t>(1));
+      const int64_t grid_size =
+          std::min(max_blocks, (size + block_size - 1) / block_size);
+      return std::tuple<int64_t, int64_t>{block_size, grid_size};
+    };
+
     if (x_dims.size() == 1) {
       float padding_value = context.Attr<float>("padding_value");
       math::SetConstant<DeviceContext, T> set_padding_value;
@@ -67,26 +79,23 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> {
       auto size = (offset > 0) ? x_length + offset : x_length - offset;
       const int& x_stride = ComputeStride(0, x_dims);
       if (size > 0) {
-        const int block_num = std::min(static_cast<int>(size),
-                                       dev_ctx.GetMaxPhysicalThreadCount());
-        int size_ = static_cast<int>(size);
-        int block_num_ = static_cast<int>(block_num);
-        const int grid_num =
-            std::min(1024, (size_ + block_num_ - 1) / block_num_);
         const auto& out_stride_0 = ComputeStride(0, out_dims);
         const auto& out_stride_1 = ComputeStride(1, out_dims);
         auto start =
             (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0);
 
-        PasteDiagonalKernel<T><<<grid_num, block_num, 0, dev_ctx.stream()>>>(
-            out_data, x_data, start, x_length, out_stride_0 + out_stride_1,
-            x_stride);
+        std::tuple<int64_t, int64_t> block_grid_size = GetBlockGridSize(size);
+
+        PasteDiagonalKernel<
+            T><<<std::get<1>(block_grid_size), std::get<0>(block_grid_size), 0,
+                 dev_ctx.stream()>>>(out_data, x_data, start, x_length,
+                                     out_stride_0 + out_stride_1, x_stride);
       }
     } else {
       const int& x_stride_0 = ComputeStride(0, x_dims);
       const int& x_stride_1 = ComputeStride(1, x_dims);
 
-      int size;
+      int64_t size;
       if (offset > 0) {
         size = std::min(x_dims[0], x_dims[1] - offset);
       } else {
@@ -94,18 +103,15 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> {
       }
 
       if (size > 0) {
-        const int block_num = std::min(static_cast<int>(size),
-                                       dev_ctx.GetMaxPhysicalThreadCount());
-        int size_ = static_cast<int>(size);
-        int block_num_ = static_cast<int>(block_num);
-        const int grid_num =
-            std::min(1024, (size_ + block_num_ - 1) / block_num_);
         auto start = (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0);
         const auto& out_stride_0 = ComputeStride(0, out_dims);
 
-        ExtractDiagonalKernel<T><<<grid_num, block_num, 0, dev_ctx.stream()>>>(
-            out_data, x_data, start, size, x_stride_0 + x_stride_1,
-            out_stride_0);
+        std::tuple<int64_t, int64_t> block_grid_size = GetBlockGridSize(size);
+
+        ExtractDiagonalKernel<
+            T><<<std::get<1>(block_grid_size), std::get<0>(block_grid_size), 0,
+                 dev_ctx.stream()>>>(out_data, x_data, start, size,
+                                     x_stride_0 + x_stride_1, out_stride_0);
       }
     }
   }
diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc
index b2cc9390fa2267404ac246c6b36800833d0dd679..a0ac82a6f4a432ee0f0427a90508c88a262799e3 100644
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ b/paddle/fluid/operators/distributed/communicator.cc
@@ -74,8 +74,12 @@ void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
   } else {
     recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_));
   }
+
+  InitParams();
 }
 
+void AsyncCommunicator::InitParams() { RecvNoBarrier(); }
+
 AsyncCommunicator::~AsyncCommunicator() {
   running_ = false;
   if (main_thread_) main_thread_->join();
@@ -157,16 +161,18 @@ void AsyncCommunicator::MainThread() {
   }
 
   while (running_) {
-    int meet = Meet();
-
-    VLOG(1) << "async_meet: " << meet;
-
-    SendGlobalStep(meet);
-    SendByCommunicator(meet);
-    BarrierSend();
-    RecvByCommunicator();
-    BarrierRecv();
-    BarrierWeakUp();
+    int batches = BatchesCounter();
+
+    if (batches > 0) {
+      SendGlobalStep(batches);
+      SendByCommunicator(batches);
+      BarrierSend();
+      RecvByCommunicator();
+      BarrierRecv();
+      BarrierWeakUp();
+    } else {
+      VLOG(1) << "get nothing from sending queue, will skip send/recv";
+    }
   }
   VLOG(1) << "communicator stopped, send thread exit";
 }
@@ -187,7 +193,7 @@ void AsyncCommunicator::RecvNoBarrier() {
       auto &var_name = iter.first;
       VLOG(4) << "recv var " << var_name;
       auto recv_functor = distributed::ParameterRecv<float>();
-      recv_functor(iter.second, *recv_scope_, false);
+      recv_functor(iter.second, *recv_scope_);
     };
     task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task)));
   }
@@ -197,7 +203,7 @@ void AsyncCommunicator::RecvNoBarrier() {
   }
 }
 
-int AsyncCommunicator::Meet() {
+int AsyncCommunicator::BatchesCounter() {
   auto &step_queue = send_varname_to_queue_.at(STEP_COUNTER);
 
   size_t merged_var_num = 0;
@@ -316,7 +322,7 @@ void HalfAsyncCommunicator::Clean() {
   }
 }
 
-int HalfAsyncCommunicator::Meet() {
+int HalfAsyncCommunicator::BatchesCounter() {
   while (running_) {
     if (barrier_counter_.load() >= barrier_trigger_.load() &&
         barrier_trigger_.load() != 0) {
@@ -443,7 +449,7 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
   old_scope_.reset(new Scope());
   pserver_scope_.reset(new Scope());
 
-  Init();
+  InitParams();
 }
 
 void GeoCommunicator::Send(const std::vector<std::string> &var_names,
@@ -626,9 +632,7 @@ void GeoCommunicator::RecvByCommunicator() {
       if (recv_ctx.is_sparse) {
         RecvSparse(var_name);
       } else {
-        VLOG(1) << "recv dense " << var_name << " begin";
         RecvDense(var_name);
-        VLOG(1) << "recv dense " << var_name << " done";
       }
     };
     tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task)));
@@ -696,7 +700,7 @@ void GeoCommunicator::RecvDense(const std::string &varname) {
 
   auto &ctx = recv_varname_to_ctx_.at(varname);
   auto recv = distributed::ParameterRecv<float>();
-  recv(ctx, *pserver_scope_, true);
+  recv(ctx, *pserver_scope_);
 
   PADDLE_ENFORCE_EQ(
       var_psrever->IsInitialized(), true,
@@ -721,7 +725,7 @@ void GeoCommunicator::RecvDense(const std::string &varname) {
              t_timestamp->data<float>());
 }
 
-void GeoCommunicator::Init() {
+void GeoCommunicator::InitParams() {
   std::vector<std::future<void>> tasks;
   tasks.reserve(recv_varname_to_ctx_.size());
 
@@ -744,12 +748,17 @@ void GeoCommunicator::Init() {
 }
 
 void GeoCommunicator::InitDense(const std::string varname) {
-  auto *var = old_scope_->Var(varname);
-  var->GetMutable<framework::LoDTensor>();
-
   auto &ctx = recv_varname_to_ctx_.at(varname);
   auto recv = distributed::ParameterRecv<float>();
-  recv(ctx, *old_scope_);
+  recv(ctx, *recv_scope_);
+
+  auto *global_var = recv_scope_->FindVar(varname);
+  global_var->GetMutable<framework::LoDTensor>();
+
+  auto *old_var = old_scope_->Var(varname);
+  old_var->GetMutable<framework::LoDTensor>();
+
+  framework::CopyVariable(*global_var, old_var);
   VLOG(1) << "init dense variable " << varname << " done";
 }
 
@@ -781,22 +790,41 @@ void GeoCommunicator::InitSparse() {
 
   LargeScaleKV::Init(metas);
 
-  for (size_t i = 0; i < metas.size(); i++) {
-    auto &varname = metas[i].name;
-    auto &dict = dicts[i];
+  for (auto &meta : metas) {
+    auto &ctx = recv_varname_to_ctx_.at(meta.name);
+    auto recv = distributed::ParameterRecv<float>();
 
-    std::vector<int64_t> ids;
-    ids.reserve(dict);
+    auto *global_var = recv_scope_->FindVar(meta.name);
+    auto global_value = global_var->Get<framework::LoDTensor>();
+    auto rows = global_value.dims()[0];
+    auto dim1 = global_value.dims()[1];
 
-    for (auto j = 0; j < dict; ++j) {
-      ids.push_back(j);
-    }
+    recv(ctx, *recv_scope_);
+    VLOG(1) << "recv " << meta.name << " with global scope for init";
+
+    auto n_rows = global_var->Get<framework::LoDTensor>().dims()[0];
+
+    PADDLE_ENFORCE_EQ(
+        rows, n_rows,
+        platform::errors::InvalidArgument(
+            "global var: %s origin dim must equal recved rows", meta.name));
+
+    std::vector<int64_t> ids(rows);
+    std::iota(ids.begin(), ids.end(), 0);
 
     auto *ins = distributed::LargeScaleKV::GetInstance();
-    ins->Get(varname)->Init(ids);
+    std::vector<std::vector<std::vector<float> *>> values;
+
+    ins->Get(meta.name)->Init(ids);
+    ins->Get(meta.name)->Get(ids, {"Param"}, &values);
 
-    VLOG(3) << "GeoCommunicator init sparse " << varname << " with size "
-            << ids.size();
+    auto blas = math::GetBlas<platform::CPUDeviceContext, float>(
+        paddle::platform::CPUDeviceContext());
+
+    for (auto &id : ids) {
+      blas.VCOPY(dim1, global_value.data<float>() + id * dim1,
+                 values[id][0]->data());
+    }
   }
 
   VLOG(3) << "init sparse variable done";
diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h
index 2f6da150d1e1375c332f7e55ea5b16c07f067a40..4a9a9eb1701f5a9102de9de164a7679999ee2a3e 100644
--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <deque>
 #include <map>
 #include <memory>
+#include <numeric>
 #include <set>
 #include <string>
 #include <unordered_map>
@@ -29,6 +30,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/distributed/communicator_common.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/large_scale_kv.h"
@@ -279,6 +281,8 @@ class AsyncCommunicator : public Communicator {
                 const RpcCtxMap &recv_varname_to_ctx,
                 Scope *recv_scope) override;
 
+  void InitParams();
+
   void MainThread();
 
   void Send(const std::vector<std::string> &var_names,
@@ -293,7 +297,7 @@ class AsyncCommunicator : public Communicator {
 
   virtual void RecvNoBarrier();
 
-  virtual int Meet();
+  virtual int BatchesCounter();
 
   virtual void BarrierSend() {}
 
@@ -350,7 +354,7 @@ class HalfAsyncCommunicator : public AsyncCommunicator {
 
   void BarrierTriggerReset(int initial_val) override;
 
-  int Meet();
+  int BatchesCounter();
 
   void BarrierWeakUp();
 
@@ -435,7 +439,7 @@ class GeoCommunicator : public AsyncCommunicator {
 
   void RecvDense(const std::string &varname);
 
-  void Init();
+  void InitParams();
 
   void InitSparse();
 
diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc
index 5409ec54987fbb7ad89f61cc1655a4c3ef302ac0..3b8479c91b0b619430ebde26b26f0ae6c9fc59cb 100644
--- a/paddle/fluid/operators/distributed/parameter_recv.cc
+++ b/paddle/fluid/operators/distributed/parameter_recv.cc
@@ -41,8 +41,67 @@ using SelectedRows = framework::SelectedRows;
 using DDim = framework::DDim;
 
 template <typename T>
-void RecvSelectedRows(const CommContext &rpc_ctx,
-                      const framework::Scope &scope) {
+void RecvSparseLodTensor(const CommContext &rpc_ctx,
+                         const framework::Scope &scope) {
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto cpu_place = platform::CPUPlace();
+  auto &cpu_ctx = *pool.Get(cpu_place);
+
+  distributed::RPCClient *rpc_client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
+
+  std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
+  std::vector<const float *> tensors;
+  std::vector<distributed::VarHandlePtr> rets;
+  for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) {
+    auto &recv_var_name = rpc_ctx.splited_varnames[i];
+    auto *local_var = local_scope->Var(recv_var_name);
+    VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i];
+    // sparse param in recv_scope is LoDTensor
+    rets.push_back(rpc_client->AsyncGetVarNoBarrier(
+        rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name,
+        recv_var_name));
+
+    const auto *value = local_var->Get<framework::LoDTensor>().data<float>();
+    tensors.push_back(value);
+  }
+
+  for (size_t i = 0; i < rets.size(); i++) {
+    PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout(
+                                               "internal error in RPCClient"));
+  }
+
+  auto *merged_var = scope.FindVar(rpc_ctx.var_name);
+
+  if (merged_var == nullptr || !merged_var->IsInitialized()) {
+    PADDLE_THROW(
+        platform::errors::InvalidArgument("%s must initialized at first."));
+  }
+  auto dims1 = merged_var->Get<framework::LoDTensor>().dims()[1];
+  int64_t height = 0;
+  for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) {
+    auto *splited_var = local_scope->FindVar(rpc_ctx.splited_varnames[i]);
+    height += splited_var->Get<framework::LoDTensor>().dims()[0];
+  }
+
+  PADDLE_ENFORCE_EQ(merged_var->Get<framework::LoDTensor>().dims()[0], height,
+                    "recved var must has same dims with local var");
+
+  auto *merged_t = merged_var->GetMutable<framework::LoDTensor>();
+  auto *merged_d = merged_t->mutable_data<float>(cpu_place);
+
+  auto pserver_num = rpc_ctx.splited_varnames.size();
+  for (int x = 0; x < height; ++x) {
+    auto id = x % pserver_num;
+    auto idx = x / pserver_num;
+    std::memcpy(merged_d + x * dims1, tensors[id] + idx * dims1,
+                sizeof(float) * dims1);
+  }
+}
+
+template <typename T>
+void RecvGeoSparseRecords(const CommContext &rpc_ctx,
+                          const framework::Scope &scope) {
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto cpu_place = platform::CPUPlace();
   auto &cpu_ctx = *pool.Get(cpu_place);
@@ -84,9 +143,14 @@ void RecvSelectedRows(const CommContext &rpc_ctx,
     ids_num += recv_t.rows().size();
     width = recv_t.value().dims()[1];
 
-    std::transform(recv_t.rows().begin(), recv_t.rows().end(),
-                   std::back_inserter(all_ids),
-                   [&](int64_t id) { return id * pserver_num + i; });
+    if (rpc_ctx.is_distributed) {
+      std::copy(recv_t.rows().begin(), recv_t.rows().end(),
+                std::back_inserter(all_ids));
+    } else {
+      std::transform(recv_t.rows().begin(), recv_t.rows().end(),
+                     std::back_inserter(all_ids),
+                     [&](int64_t id) { return id * pserver_num + i; });
+    }
   }
 
   auto *var = scope.FindVar(rpc_ctx.var_name);
@@ -146,7 +210,8 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
 
 template <typename T>
 void ParameterRecv<T>::operator()(const CommContext &rpc_ctx,
-                                  const framework::Scope &scope, bool barrier) {
+                                  const framework::Scope &scope,
+                                  bool geo_records) {
   VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name;
 
   PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1,
@@ -154,18 +219,21 @@ void ParameterRecv<T>::operator()(const CommContext &rpc_ctx,
                         "origin_varnames.size() >= 1 is permitted"));
 
   if (rpc_ctx.is_sparse) {
-    RecvSelectedRows<T>(rpc_ctx, scope);
+    if (geo_records) {
+      RecvGeoSparseRecords<T>(rpc_ctx, scope);
+    } else {
+      RecvSparseLodTensor<T>(rpc_ctx, scope);
+    }
   } else {
     RecvLodTensor<T>(rpc_ctx, scope);
   }
 
   VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name;
 }
-
 template <typename T>
 void ParameterRecv<T>::operator()(const CommContext &rpc_ctx,
                                   const framework::Scope &scope) {
-  this->operator()(rpc_ctx, scope, true);
+  this->operator()(rpc_ctx, scope, false);
 }
 
 template struct ParameterRecv<float>;
diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
index b064265917b2a36b2261c6c43d355f9891aa9811..c9f9daf3b3c0442e379cd7a22fcf48dbe3acbb5d 100644
--- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
@@ -48,7 +48,9 @@ class FetchBarrierOp : public framework::OperatorBase {
     }
 
     for (size_t i = 0; i < rets.size(); i++) {
-      PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
+      PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U,
+                        platform::errors::Unavailable(
+                            "Internal error occurred in RPCClient."));
     }
   }
 };
diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e53ce8cc67c08269e15a20e2cd2fc57a2c5ace17
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc
@@ -0,0 +1,153 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h"
+
+#include <string>
+namespace paddle {
+namespace operators {
+
+class LargeScaleFuseAdamOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of LargeScaleFuseAdamOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("LearningRate"),
+        "Input(LearningRate) of LargeScaleFuseAdamOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+
+    PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
+                      "Maybe the Input variable LearningRate has not "
+                      "been initialized. You may need to confirm "
+                      "if you put exe.run(startup_program) "
+                      "after optimizer.minimize function.");
+
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 element");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Grad");
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const framework::Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const {
+    if (var_name == "LearningRate") {
+      return framework::OpKernelType(tensor.type(), tensor.place(),
+                                     tensor.layout());
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+class LargeScaleFuseAdamOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto in_var_type = ctx->GetInputType("Grad");
+    PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
+                          in_var_type == framework::proto::VarType::LOD_TENSOR,
+                      true, platform::errors::InvalidArgument(
+                                "The input Var's type should be LoDtensor or "
+                                "SelectedRows, but the received type is %s",
+                                in_var_type));
+  }
+};
+
+class LargeScaleFuseAdamOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Grad",
+             "(SelectedRows) Ids's type should be SelectedRows"
+             "THe ids to be looked up in W.");
+
+    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
+    AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
+    AddInput("LearningRate", "(Tensor) Learning rate of SGD");
+    AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
+    AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator");
+
+    AddAttr<float>("beta1",
+                   "(float, default 0.9) "
+                   "Exponential decay rate for the "
+                   "first moment estimates.")
+        .SetDefault(0.9f);
+
+    AddAttr<float>("beta2",
+                   "(float, default 0.999) "
+                   "exponential decay rate for the "
+                   "second moment estimates.")
+        .SetDefault(0.999f);
+
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-8) "
+                   "Constant for numerical stability")
+        .SetDefault(1.0e-8f);
+
+    AddAttr<bool>("is_entry",
+                  "(bool)"
+                  "sparse table need entry");
+
+    AddAttr<std::string>("tablename",
+                         "(string)"
+                         "sparse table name");
+
+    AddAttr<std::vector<std::string>>("value_names",
+                                      "(strings)"
+                                      "sparse table name");
+
+    AddComment(R"DOC(
+Adam Optimizer.
+
+This implements the Adam optimizer from Section 2 of the Adam
+paper : https://arxiv.org/abs/1412.6980.
+Adam is a first-order gradient-based optimization method based on
+adaptive estimates of lower-order moments.
+
+Adam updates:
+
+$$
+moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
+moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
+learning\_rate = learning\_rate *
+                  \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
+param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
+$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    lookup_sparse_table_fuse_adam, ops::LargeScaleFuseAdamOp,
+    ops::LargeScaleFuseAdamOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ops::LargeScaleFuseAdamOpInferVarType);
+
+REGISTER_OP_CPU_KERNEL(
+    lookup_sparse_table_fuse_adam,
+    ops::LargeScaleFuseAdamOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..89b8d54a463b03076c9489b842540ea4a4f68a82
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h
@@ -0,0 +1,142 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <math.h>  // for sqrt in CPU and CUDA
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/large_scale_kv.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class LargeScaleFuseAdamOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override;
+};
+
+template <typename T>
+class LargeScaleFuseAdamOpKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    using paddle::framework::LoDTensor;
+
+    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+    const auto *grad_var = ctx.InputVar("Grad");
+
+    PADDLE_ENFORCE(
+        grad_var->IsType<framework::SelectedRows>(),
+        platform::errors::InvalidArgument(
+            "in large scale optimize, gradient should only be SelectedRows"));
+
+    const auto &grad = grad_var->Get<framework::SelectedRows>();
+
+    // for distributed training, a sparse var may be empty,
+    // just skip updating.
+    if (grad.rows().size() == 0) {
+      return;
+    }
+
+    framework::SelectedRows tmp_grad_merge;
+    const framework::SelectedRows *grad_merge_ptr;
+    math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
+    merge_func(ctx.template device_context<platform::CPUDeviceContext>(), grad,
+               &tmp_grad_merge, true);
+    grad_merge_ptr = &tmp_grad_merge;
+
+    std::vector<int64_t> in_rows;
+    in_rows.reserve(grad_merge_ptr->rows().size());
+    std::copy(grad_merge_ptr->rows().begin(), grad_merge_ptr->rows().end(),
+              std::back_inserter(in_rows));
+
+    const auto *lr = learning_rate->data<T>();
+    auto grad_v = grad_merge_ptr->value();
+    auto grad_width = grad_v.dims()[1];
+
+    //    auto is_entry = context.Attr<bool>("is_entry");
+    auto tablename = ctx.Attr<std::string>("tablename");
+    auto value_names = ctx.Attr<std::vector<std::string>>("value_names");
+
+    auto *beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
+    auto *beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
+    auto *beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
+    auto *beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+
+    PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1,
+                      platform::errors::InvalidArgument(
+                          "beta1 pow output size should be 1, but received "
+                          "value is:%d.",
+                          beta1_pow_out->numel()));
+
+    PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1,
+                      platform::errors::InvalidArgument(
+                          "beta2 pow output size should be 1, but received "
+                          "value is:%d.",
+                          beta2_pow_out->numel()));
+
+    // update beta1 and beta2
+    beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
+        beta1 * beta1_pow->data<T>()[0];
+    beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
+        beta2 * beta2_pow->data<T>()[0];
+
+    std::vector<std::vector<std::vector<float> *>> values;
+    std::vector<int64_t> dims;
+
+    auto *ins = distributed::LargeScaleKV::GetInstance();
+    auto *table = ins->Get(tablename);
+    table->Get(in_rows, value_names, &values);
+    table->Dims({"Param"}, &dims);
+
+    PADDLE_ENFORCE_EQ(dims[0], grad_width,
+                      platform::errors::InvalidArgument(
+                          "param_row should have the same size with grad_row"));
+
+    T lr_ = lr[0];
+    T beta1_pow_ = beta1_pow->data<T>()[0];
+    T beta2_pow_ = beta2_pow->data<T>()[0];
+
+    lr_ *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_);
+
+    for (size_t i = 0; i < in_rows.size(); i++) {
+      auto &params = values[i][0];
+      auto &moment_1 = values[i][1];
+      auto &moment_2 = values[i][2];
+
+      auto *p_data = params->data();
+      auto *m1_data = moment_1->data();
+      auto *m2_data = moment_2->data();
+
+      for (int x = 0; x < grad_width; ++x) {
+        auto g = grad_v.data<T>()[grad_width * i + x];
+        m1_data[x] = beta1 * m1_data[x] + (1 - beta1) * g;
+        m2_data[x] = beta2 * m2_data[x] + (1 - beta2) * g * g;
+        p_data[x] -= lr_ * (m1_data[x] / (sqrt(m2_data[x]) + epsilon));
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..010658b5280d7feeb683112b401dbcaaa265daac
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h"
+
+#include <string>
+namespace paddle {
+namespace operators {
+
+class LargeScaleFuseSGDOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of LargeScaleFuseSGDOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("LearningRate"),
+        "Input(LearningRate) of LargeScaleFuseSGDOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+
+    PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
+                      "Maybe the Input variable LearningRate has not "
+                      "been initialized. You may need to confirm "
+                      "if you put exe.run(startup_program) "
+                      "after optimizer.minimize function.");
+
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 element");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Grad");
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const framework::Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const {
+    if (var_name == "LearningRate") {
+      return framework::OpKernelType(tensor.type(), tensor.place(),
+                                     tensor.layout());
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+class LargeScaleFuseSGDOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto in_var_type = ctx->GetInputType("Grad");
+    PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
+                          in_var_type == framework::proto::VarType::LOD_TENSOR,
+                      true, platform::errors::InvalidArgument(
+                                "The input Var's type should be LoDtensor or "
+                                "SelectedRows, but the received type is %s",
+                                in_var_type));
+  }
+};
+
+class LargeScaleFuseSGDOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Grad",
+             "(SelectedRows) Ids's type should be SelectedRows"
+             "THe ids to be looked up in W.");
+    AddInput("LearningRate", "(Tensor) Learning rate of SGD");
+    AddAttr<bool>("is_entry",
+                  "(bool)"
+                  "sparse table need entry");
+
+    AddAttr<std::string>("tablename",
+                         "(string)"
+                         "sparse table name");
+
+    AddAttr<std::vector<std::string>>("value_names",
+                                      "(strings)"
+                                      "sparse table name");
+
+    AddComment(R"DOC(
+
+LargeScaleFuseSGD operator
+
+This operator implements one step of the stochastic gradient descent algorithm.
+
+$$param\_out = param - learning\_rate * grad$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    lookup_sparse_table_fuse_sgd, ops::LargeScaleFuseSGDOp,
+    ops::LargeScaleFuseSGDOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ops::LargeScaleFuseSGDOpInferVarType);
+
+REGISTER_OP_CPU_KERNEL(
+    lookup_sparse_table_fuse_sgd,
+    ops::LargeScaleFuseSGDOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d4bf1015fa3a8c2c8fb102fcd890f41b296269d
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/large_scale_kv.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class LargeScaleFuseSGDOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override;
+};
+
+template <typename T>
+class LargeScaleFuseSGDOpKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    const auto *grad_var = ctx.InputVar("Grad");
+
+    PADDLE_ENFORCE(
+        grad_var->IsType<framework::SelectedRows>(),
+        platform::errors::InvalidArgument(
+            "in large scale optimize, gradient should only be SelectedRows"));
+
+    const auto &grad = grad_var->Get<framework::SelectedRows>();
+
+    // for distributed training, a sparse var may be empty,
+    // just skip updating.
+    if (grad.rows().size() == 0) {
+      return;
+    }
+
+    framework::SelectedRows tmp_grad_merge;
+    const framework::SelectedRows *grad_merge_ptr;
+    math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
+    merge_func(ctx.template device_context<platform::CPUDeviceContext>(), grad,
+               &tmp_grad_merge, true);
+    grad_merge_ptr = &tmp_grad_merge;
+
+    std::vector<int64_t> in_rows;
+    in_rows.reserve(grad_merge_ptr->rows().size());
+    std::copy(grad_merge_ptr->rows().begin(), grad_merge_ptr->rows().end(),
+              std::back_inserter(in_rows));
+
+    const auto *lr = learning_rate->data<T>();
+    auto grad_v = grad_merge_ptr->value();
+    auto grad_width = grad_v.dims()[1];
+
+    //    auto is_entry = context.Attr<bool>("is_entry");
+    auto tablename = ctx.Attr<std::string>("tablename");
+    auto value_names = ctx.Attr<std::vector<std::string>>("value_names");
+
+    std::vector<std::vector<std::vector<float> *>> values;
+    std::vector<int64_t> dims;
+
+    auto *ins = distributed::LargeScaleKV::GetInstance();
+    auto *table = ins->Get(tablename);
+    table->Get(in_rows, value_names, &values);
+    table->Dims({"Param"}, &dims);
+
+    PADDLE_ENFORCE_EQ(dims[0], grad_width,
+                      platform::errors::InvalidArgument(
+                          "param_row should have the same size with grad_row"));
+
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+
+    std::vector<T> grads;
+    framework::TensorToVector(grad_v, ctx.device_context(), &grads);
+
+    blas.SCAL(grads.size(), lr[0], grads.data());
+
+    for (int x = 0; x < static_cast<int>(in_rows.size()); ++x) {
+      auto &params = values[x][0];
+      blas.VSUB(grad_width, params->data(), grads.data() + grad_width * x,
+                params->data());
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc
index 15b36baeada300e1ab472737b4e35538f9882cb7..2547ba3acb16031245ceae622e11893597bb9b9b 100644
--- a/paddle/fluid/operators/distributed_ops/recv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_op.cc
@@ -37,12 +37,6 @@ class RecvOp : public framework::OperatorBase {
 
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    int do_not_run = Attr<int>("do_not_run");
-    if (do_not_run) {
-      VLOG(3) << "recv do not run!";
-      return;
-    }
-
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
     std::vector<std::string> varnames =
         Attr<std::vector<std::string>>("varnames");
@@ -63,11 +57,10 @@ class RecvOp : public framework::OperatorBase {
     if (recv_varnames.size() > 0) {
       auto *communicator = distributed::Communicator::GetInstance();
 
-      if (communicator == nullptr) {
+      if (communicator != nullptr) {
         PADDLE_THROW(platform::errors::InvalidArgument(
-            "need run fleet.init_worker first"));
+            "execute startup program must before fleet.init_worker"));
       }
-      communicator->RecvNoBarrier();
     } else {
       std::vector<distributed::VarHandlePtr> rets;
       if (with_barrier) {
diff --git a/paddle/fluid/operators/distributed_ops/send_recv_util.h b/paddle/fluid/operators/distributed_ops/send_recv_util.h
index c05a1ff1da8803c1ef3161d0e9d8604f9f1e5f3b..7dc0596ac31e2506ae02de11b33bd0532f02cc7a 100644
--- a/paddle/fluid/operators/distributed_ops/send_recv_util.h
+++ b/paddle/fluid/operators/distributed_ops/send_recv_util.h
@@ -34,16 +34,16 @@ inline bool NeedSend(const framework::Scope& scope,
       std::string::npos)
     return false;
   auto* var = scope.FindVar(varname);
-  PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
-                          varname);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound(
+               "Can not find variable '%s' in the send side.", varname));
   if (var->IsType<framework::LoDTensor>()) {
     return var->Get<framework::LoDTensor>().IsInitialized();
   } else if (var->IsType<framework::SelectedRows>()) {
     return var->Get<framework::SelectedRows>().rows().size() > 0UL;
   } else {
-    PADDLE_THROW(
-        "Variable type in send side should be in "
-        "[LodTensor, SelectedRows]");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Variable type in send side should be LodTensor or SelectedRows."));
   }
   return false;
 }
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
index 3fc5f3bfc6b1633ffe835606bbac6118e6b32ca6..477a9162fe3f779d4006deb2e20b3a16f70cdf47 100644
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -8,7 +8,8 @@ register_operators(EXCLUDES
     multihead_matmul_op
     fused_embedding_eltwise_layernorm_op
     fusion_group_op
-    fusion_gru_op)
+    fusion_gru_op
+    fused_bn_add_activation_op)
 
 # fusion_gru_op does not have CUDA kernel
 op_library(fusion_gru_op)
@@ -47,4 +48,9 @@ if (WITH_GPU)
         file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_group);\n")
         cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op)
     endif()
+    # fused_bn_add_activation
+    if (NOT ${CUDNN_VERSION} VERSION_LESS 7401)
+    op_library(fused_bn_add_activation_op)
+    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_bn_add_activation);\n")
+    endif()
 endif()
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu
index b22f28fbbe3ce8ce178a3d9c17a048817cb750e7..49fded886a0339a0456ee55d0d4d1249461f93b9 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -204,6 +204,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     auto x_dims = framework::vectorize(transformed_input.dims());
     auto f_dims = framework::vectorize(filter->dims());
     if (!exhaustive_search) {
+#if CUDNN_VERSION >= 8000
       int perf_count;
       int best_algo_idx = 0;
       size_t tmp_size = 0;
@@ -215,13 +216,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
               cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count,
               perf_results.get()));
       algo = (perf_results.get())[best_algo_idx].algo;
-      VLOG(3) << "cuDNN forward algo " << algo;
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
               handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
               cudnn_output_desc, algo, &workspace_size_in_bytes));
       if (workspace_size_in_bytes > workspace_size_limit)
         workspace_size_limit = workspace_size_in_bytes;
+#else
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo));
+      VLOG(3) << "cuDNN forward algo " << algo;
+#endif
     } else {
       std::function<cudnnConvolutionFwdAlgo_t()> search_func =
           [&]() -> cudnnConvolutionFwdAlgo_t {
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5b3ed03bb6419cd3c36f6ee2e856f1816d314c75
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
@@ -0,0 +1,255 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+void FusedBatchNormAddActOp::InferShape(
+    framework::InferShapeContext *ctx) const {
+  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale",
+                 "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias",
+                 "FusedBatchNormAddActOp");
+
+  // check output
+  OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasOutput("MeanOut"), "Output", "MeanOut",
+                 "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasOutput("VarianceOut"), "Output", "VarianceOut",
+                 "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean",
+                 "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance",
+                 "FusedBatchNormAddActOp");
+
+  const auto x_dims = ctx->GetInputDim("X");
+  const auto z_dims = ctx->GetInputDim("Z");
+  PADDLE_ENFORCE_EQ(x_dims, z_dims,
+                    platform::errors::InvalidArgument(
+                        "ShapeError: the shapes of input "
+                        "must be equal. But received: the shape "
+                        "of input X = [%s], and the shape of "
+                        "input Y = [%s]",
+                        x_dims, z_dims));
+  PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::InvalidArgument(
+                                          "ShapeError: the dimensions of input "
+                                          "must greater than or equal to 2."
+                                          "But received: the shape of input "
+                                          "= [%s], the dimension of input = "
+                                          "[%d]",
+                                          x_dims, x_dims.size()));
+  PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::InvalidArgument(
+                                          "ShapeError: the dimensions of input "
+                                          "must smaller than or equal to 5."
+                                          "But received: the shape of input "
+                                          "= [%s], the dimension of input = "
+                                          "[%d]",
+                                          x_dims, x_dims.size()));
+
+  const int64_t C = x_dims[x_dims.size() - 1];
+
+  auto scale_dim = ctx->GetInputDim("Scale");
+  auto bias_dim = ctx->GetInputDim("Bias");
+
+  PADDLE_ENFORCE_EQ(
+      scale_dim.size(), 1UL,
+      platform::errors::InvalidArgument(
+          "ShapeError: the dimension of scale must equal to 1."
+          "But received: the shape of scale is [%s], the dimension "
+          "of scale is [%d]",
+          scale_dim, scale_dim.size()));
+  PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL,
+                    platform::errors::InvalidArgument(
+                        "ShapeError: the dimension of bias must equal to 1."
+                        "But received: the shape of bias is [%s],the dimension "
+                        "of bias is [%d]",
+                        bias_dim, bias_dim.size()));
+
+  bool check = true;
+  if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
+                              framework::product(bias_dim) <= 0)) {
+    check = false;
+  }
+
+  if (check) {
+    PADDLE_ENFORCE_EQ(scale_dim[0], C,
+                      platform::errors::InvalidArgument(
+                          "ShapeError: the shape of scale must equal to [%d]"
+                          "But received: the shape of scale is [%d]",
+                          C, scale_dim[0]));
+    PADDLE_ENFORCE_EQ(bias_dim[0], C,
+                      platform::errors::InvalidArgument(
+                          "ShapeError: the shape of bias must equal to [%d]"
+                          "But received: the shape of bias is [%d]",
+                          C, bias_dim[0]));
+  }
+  ctx->SetOutputDim("Y", x_dims);
+  ctx->SetOutputDim("MeanOut", {C});
+  ctx->SetOutputDim("VarianceOut", {C});
+  ctx->SetOutputDim("SavedMean", {C});
+  ctx->SetOutputDim("SavedVariance", {C});
+  ctx->ShareLoD("X", "Y");
+}
+
+framework::OpKernelType FusedBatchNormAddActOp::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
+  auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+  // By default, the type of the scale, bias, mean,
+  // and var tensors should be float when input tensor's dtype is float16.
+  auto bn_param_type = framework::proto::VarType::FP32;
+
+  PADDLE_ENFORCE_EQ(
+      bn_param_type, ctx.Input<Tensor>("Scale")->type(),
+      platform::errors::InvalidArgument("Scale input should be of float type"));
+  PADDLE_ENFORCE_EQ(
+      bn_param_type, ctx.Input<Tensor>("Bias")->type(),
+      platform::errors::InvalidArgument("Bias input should be of float type"));
+
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
+  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                 library);
+}
+
+void FusedBatchNormAddActOpMaker::Make() {
+  AddInput("X", "The input tensor");
+  AddInput("Z", "The input tensor");
+  AddInput("Scale",
+           "Scale is a 1-dimensional tensor of size C "
+           "that is applied to the output");
+  AddInput("Bias",
+           "Bias is a 1-dimensional tensor of size C "
+           "that is applied to the output");
+  AddOutput("Y", "result after normalization");
+  AddOutput("MeanOut",
+            "Share memory with Mean. "
+            "Store the global mean when training");
+  AddOutput("VarianceOut",
+            "Share memory with Variance. "
+            "Store the global Variance when training");
+  AddOutput("SavedMean",
+            "Mean of the current mini batch, "
+            "will apply to output when training")
+      .AsIntermediate();
+  AddOutput("SavedVariance",
+            "Variance of the current mini batch, "
+            "will apply to output when training")
+      .AsIntermediate();
+  AddOutput("ReserveSpace",
+            "Reserve GPU space for triggering the new semi-persistent "
+            "NHWC kernel");
+  AddAttr<float>("momentum", "").SetDefault(0.9);
+  AddAttr<float>("epsilon", "")
+      .SetDefault(1e-5)
+      .AddCustomChecker([](const float &epsilon) {
+        PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true,
+                          platform::errors::InvalidArgument(
+                              "'epsilon' should be between 0.0 and 0.001."));
+      });
+  AddAttr<std::string>("act_type", "The activation type to be fused.")
+      .SetDefault("relu");
+  AddComment(R"DOC(
+Fused Batch Normalization with activation.
+
+Batch Norm has been implemented as discussed in the paper:
+https://arxiv.org/pdf/1502.03167.pdf
+Batch Norm can be used as a normalizer function for conv2d and fully_connected operations.
+Now, the required data format for FusedBatchNormAddActOp is NHWC `[batch, in_height, in_width, in_channels]`.
+
+)DOC");
+}
+
+void FusedBatchNormAddActGradOp::InferShape(
+    framework::InferShapeContext *ctx) const {
+  // check input
+  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
+                 "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z",
+                 "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale",
+                 "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean",
+                 "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance",
+                 "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
+                 framework::GradVarName("Y"), "FusedBatchNormAddActGradOp");
+
+  // check output
+  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
+                 framework::GradVarName("X"), "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Z")), "Output",
+                 framework::GradVarName("Z"), "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale")), "Output",
+                 framework::GradVarName("Scale"), "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias")), "Output",
+                 framework::GradVarName("Bias"), "FusedBatchNormAddActGradOp");
+
+  const auto in_dims = ctx->GetInputDim("X");
+  const int C = in_dims[in_dims.size() - 1];
+
+  ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
+  ctx->SetOutputDim(framework::GradVarName("Z"), in_dims);
+  ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
+  ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
+}
+
+framework::OpKernelType FusedBatchNormAddActGradOp::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
+  const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+  if (var == nullptr) {
+    PADDLE_THROW(platform::errors::NotFound(
+        "Can not find Y@GRAD in the execution context."));
+  }
+  const Tensor *t = nullptr;
+  if (var->IsType<Tensor>()) {
+    t = &var->Get<Tensor>();
+  } else if (var->IsType<LoDTensor>()) {
+    t = &var->Get<LoDTensor>();
+  }
+  if (t == nullptr) {
+    PADDLE_THROW(
+        platform::errors::NotFound("Can not get the tensor value of Y@GRAD."));
+  }
+
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
+  return framework::OpKernelType(
+      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout,
+      library);
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    fused_bn_add_activation, ops::FusedBatchNormAddActOp,
+    ops::FusedBatchNormAddActOpMaker, ops::FusedBatchNormAddActOpInferVarType,
+    ops::FusedBatchNormAddActGradOpMaker<paddle::framework::OpDesc>,
+    ops::FusedBatchNormAddActGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(fused_bn_add_activation_grad,
+                  ops::FusedBatchNormAddActGradOp);
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7f1d297cda3fae54cdde089f25ccdf6715142c5f
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
@@ -0,0 +1,338 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cfloat>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/norm_utils.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/float16.h"
+
+DECLARE_bool(cudnn_batchnorm_spatial_persistent);
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename T>
+using CudnnDataType = platform::CudnnDataType<T>;
+template <typename T>
+using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
+
+template <typename T>
+class FusedBatchNormAddActKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
+    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
+    float momentum = ctx.Attr<float>("momentum");
+    std::string act_type = ctx.Attr<std::string>("act_type");
+
+    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+      LOG(ERROR) << "Provided epsilon is smaller than "
+                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
+                 << "CUDNN_BN_MIN_EPSILON instead.";
+    }
+    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+
+    // Get the size for each dimension.
+    // NHWC [batch_size, in_height, in_width, in_channels]
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *z = ctx.Input<Tensor>("Z");
+    const auto &in_dims = x->dims();
+
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+
+    auto *mean_out = ctx.Output<Tensor>("MeanOut");
+    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+    auto *y = ctx.Output<Tensor>("Y");
+    y->mutable_data<T>(ctx.GetPlace());
+
+    int N, C, H, W, D;
+    const DataLayout data_layout = DataLayout::kNHWC;
+    ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D);
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    // ------------------- cudnn descriptors ---------------------
+    auto handle = dev_ctx.cudnn_handle();
+    cudnnTensorDescriptor_t data_desc_;
+    cudnnTensorDescriptor_t bn_param_desc_;
+    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
+
+    std::vector<int> dims = {N, C, H, W, D};
+    std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data()));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
+                                                         data_desc_, mode_));
+
+    double this_factor = 1. - momentum;
+    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
+    platform::ScopedActivationDescriptor scope_act_desc;
+    cudnnActivationDescriptor_t activation_desc_ =
+        scope_act_desc.descriptor<T>(act_type);
+    size_t workspace_size = 0;
+    size_t reserve_space_size = 0;
+    void *reserve_space_ptr = nullptr;
+    void *workspace_ptr = nullptr;
+    Tensor workspace_tensor;
+    // Create reserve space and workspace for batch norm.
+    // Create tensor for each batchnorm op, it will be used in the
+    // backward. Thus this tensor shouldn't be temp.
+    auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
+    PADDLE_ENFORCE_NOT_NULL(
+        reserve_space,
+        platform::errors::NotFound(
+            "The argument ReserveSpace of batch_norm op is not found."));
+
+    // --------------- cudnn batchnorm workspace ---------------
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::
+            cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
+                /*handle=*/handle,
+                /*mode=*/mode_,
+                /*bnOps=*/bnOps_,
+                /*xDesc=*/data_desc_,
+                /*zDesc=*/data_desc_,
+                /*yDesc=*/data_desc_,
+                /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
+                /*activationDesc=*/activation_desc_,
+                /*sizeInBytes=*/&workspace_size));
+
+    // -------------- cudnn batchnorm reserve space --------------
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
+            /*handle=*/handle,
+            /*mode=*/mode_,
+            /*bnOps=*/bnOps_,
+            /*activationDesc=*/activation_desc_,
+            /*xDesc=*/data_desc_,
+            /*sizeInBytes=*/&reserve_space_size));
+
+    reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
+                                                    reserve_space_size);
+    workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
+                                                  workspace_size);
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
+            handle, mode_, bnOps_, CudnnDataType<T>::kOne(),
+            CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
+            data_desc_, z->template data<T>(), data_desc_,
+            y->template data<T>(), bn_param_desc_,
+            scale->template data<BatchNormParamType<T>>(),
+            bias->template data<BatchNormParamType<T>>(), this_factor,
+            mean_out->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            variance_out->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
+                         ctx.GetPlace()),
+            saved_variance->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
+            reserve_space_size));
+
+    // clean when exit.
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+  }
+};
+
+template <typename T>
+class FusedBatchNormAddActGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
+    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
+    std::string act_type = ctx.Attr<std::string>("act_type");
+
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *z = ctx.Input<Tensor>("Z");
+    const auto *y = ctx.Input<Tensor>("Y");
+    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+    const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");
+
+    const auto &in_dims = x->dims();
+
+    int N, C, H, W, D;
+    const DataLayout data_layout = DataLayout::kNHWC;
+    ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D);
+
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_z = ctx.Output<Tensor>(framework::GradVarName("Z"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    d_x->mutable_data<T>(ctx.GetPlace());
+    d_z->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE_EQ(
+        d_scale && d_bias, true,
+        platform::errors::PreconditionNotMet(
+            "Both the scale grad and the bias grad must not be null."));
+    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL,
+                      platform::errors::PreconditionNotMet(
+                          "The scale only has one dimension."));
+    PADDLE_ENFORCE_EQ(
+        scale->dims()[0], C,
+        platform::errors::PreconditionNotMet(
+            "The size of scale is equal to the channel of Input(X)."));
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    std::vector<int> dims = {N, C, H, W, D};
+    std::vector<int> strides = {H * W * C * D, 1, W * D * C, D * C, C};
+    // ------------------- cudnn descriptors ---------------------
+    cudnnTensorDescriptor_t data_desc_;
+    cudnnTensorDescriptor_t bn_param_desc_;
+    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
+    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+      LOG(ERROR) << "Provided epsilon is smaller than "
+                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
+                 << "CUDNN_BN_MIN_EPSILON instead.";
+    }
+    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data()));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
+                                                         data_desc_, mode_));
+
+    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
+    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
+    const auto *saved_mean_data =
+        saved_mean->template data<BatchNormParamType<T>>();
+    const auto *saved_var_data =
+        saved_var->template data<BatchNormParamType<T>>();
+
+    size_t workspace_size = 0;
+    void *workspace_ptr = nullptr;
+    Tensor workspace_tensor;
+    auto reserve_space_size = reserve_space->memory_size();
+    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
+    platform::ScopedActivationDescriptor scope_act_desc;
+    cudnnActivationDescriptor_t activation_desc_ =
+        scope_act_desc.descriptor<T>(act_type);
+    // --------------- cudnn batchnorm workspace ---------------
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize(
+            /*handle=*/dev_ctx.cudnn_handle(),
+            /*mode=*/mode_,
+            /*bnOps=*/bnOps_,
+            /*xDesc=*/data_desc_,
+            /*yDesc=*/data_desc_,
+            /*dyDesc=*/data_desc_,
+            /*dzDesc=*/data_desc_,
+            /*dxDesc=*/data_desc_,
+            /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
+            /*activationDesc=*/activation_desc_,
+            /*sizeInBytes=*/&workspace_size));
+
+    workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
+                                                  workspace_size);
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnBatchNormalizationBackwardEx(
+            /*handle=*/dev_ctx.cudnn_handle(),
+            /*mode=*/mode_,
+            /*bnOps=*/bnOps_,
+            /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
+            /*betaDataDiff=*/CudnnDataType<T>::kZero(),
+            /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
+            /*betaParamDiff=*/CudnnDataType<T>::kZero(),
+            /*xDesc=*/data_desc_,
+            /*xData=*/x->template data<T>(),
+            /*yDesc=*/data_desc_,
+            /*yData=*/y->template data<T>(),
+            /*dyDesc=*/data_desc_,
+            /*dyData=*/d_y->template data<T>(),
+            /*dzDesc=*/data_desc_,
+            /*dzData=*/d_z->template data<T>(),
+            /*dxDesc=*/data_desc_,
+            /*dxData=*/d_x->template data<T>(),
+            /*dBnScaleBiasDesc=*/bn_param_desc_,
+            /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
+            /*bnBiasData=*/bias->template data<BatchNormParamType<T>>(),
+            /*dBnScaleData=*/d_scale->template data<BatchNormParamType<T>>(),
+            /*dBnBiasData=*/d_bias->template data<BatchNormParamType<T>>(),
+            /*epsilon=*/epsilon,
+            /*savedMean=*/saved_mean_data,
+            /*savedInvVariance=*/saved_var_data,
+            /*activationDesmc=*/activation_desc_,
+            /*workspace=*/workspace_ptr,
+            /*workSpaceSizeInBytes=*/workspace_size,
+            /*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
+            /*reserveSpaceSizeInBytes=*/reserve_space_size));
+
+    // clean when exit.
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#if CUDNN_VERSION >= 7401
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(
+    fused_bn_add_activation,
+    ops::FusedBatchNormAddActKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(fused_bn_add_activation_grad,
+                        ops::FusedBatchNormAddActGradKernel<
+                            plat::CUDADeviceContext, plat::float16>);
+#endif
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c7df96e60dd89b74058ead837bb75555f3674ad
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
@@ -0,0 +1,106 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/grad_op_desc_maker.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/var_type_inference.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+class FusedBatchNormAddActOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FusedBatchNormAddActGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FusedBatchNormAddActOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+template <typename T>
+class FusedBatchNormAddActGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Z", this->Input("Z"));
+    op->SetInput("Y", this->Output("Y"));
+    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
+
+    op->SetInput("Scale", this->Input("Scale"));
+    op->SetInput("Bias", this->Input("Bias"));
+    op->SetInput("SavedMean", this->Output("SavedMean"));
+    op->SetInput("SavedVariance", this->Output("SavedVariance"));
+    op->SetInput("ReserveSpace", this->Output("ReserveSpace"));
+
+    op->SetAttrMap(this->Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Z"), this->InputGrad("Z"));
+    op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
+    op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
+  }
+};
+
+class FusedBatchNormAddActOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
+      const override {
+    static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Y"}};
+    return m;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FusedBatchNormAddActKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+template <typename DeviceContext, typename T>
+class FusedBatchNormAddActGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h
index 712ef05d8631ac74b92795321202cb5590286e82..4865a02c5292ffb9d079d0711f0bf7d6e927c441 100644
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
@@ -47,7 +47,9 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     else if (act_type == relu)
       ReluFunctor<T>()(d, x, y);
     else
-      PADDLE_THROW("unsupported activation type");
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported activation type, only supports identity, sigmoid, tanh "
+          "and relu."));
   }
 
   void Compute(const framework::ExecutionContext& context) const override {
@@ -137,7 +139,9 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     else if (act_type == relu)
       ReluGradFunctor<T>()(d, x, y, dy, dx);
     else
-      PADDLE_THROW("unsupported activation type");
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported activation type, only supports identity, sigmoid, tanh "
+          "and relu."));
   }
 
   void Compute(const framework::ExecutionContext& context) const override {
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index 1e99e22e12b2a23685dad742f175fd2b0684d334..e8a9ed878e9bd502b9bd7e7d82f574fb5740bb5d 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -104,12 +104,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
   auto dim_x = ctx->GetInputDim("X");
   auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
 
-  PADDLE_ENFORCE(
-      "bilinear" == interp_method || "nearest" == interp_method ||
-          "bicubic" == interp_method,
-      "Interpolation method can only be \"bilinear\" or \"nearest\" when "
-      "Input(X) dimension is 4, but got method = %s .",
-      interp_method);
+  PADDLE_ENFORCE_EQ("bilinear" == interp_method || "nearest" == interp_method ||
+                        "bicubic" == interp_method,
+                    true, platform::errors::InvalidArgument(
+                              "Interpolation method can only be \"bilinear\" "
+                              "or \"nearest\" or \"bicubic\" when "
+                              "Input(X) dimension is 4, but got method is %s.",
+                              interp_method));
   const DataLayout data_layout = framework::StringToDataLayout(
       ctx->Attrs().Get<std::string>("data_layout"));
 
@@ -169,13 +170,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
     auto out_size_dim = ctx->GetInputDim("OutSize");
     PADDLE_ENFORCE_EQ(
         out_size_dim.size(), 1,
-        platform::errors::InvalidArgument(
-            "OutSize's dimension size must be 1, but got dimension = %d .",
-            out_size_dim.size()));
+        platform::errors::InvalidArgument("OutSize's dimension size must be 1, "
+                                          "but got dimension size is %d .",
+                                          out_size_dim.size()));
     PADDLE_ENFORCE_EQ(
         out_size_dim[0], 2,
         platform::errors::InvalidArgument(
-            "OutSize's dim[0] must be 2, but got dimention = %d .",
+            "OutSize's dimension[0] must be 2, but got dimension[0] is %d .",
             out_size_dim[0]));
     ctx->ShareLoD("X", "Out");
     return;
@@ -264,12 +265,15 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
 
   if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
     auto out_size_dim = ctx->GetInputDim("OutSize");
-    PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
-                      "OutSize's dimension size must be 1, but got size =%d .",
-                      out_size_dim.size());
+    PADDLE_ENFORCE_EQ(
+        out_size_dim.size(), 1,
+        platform::errors::InvalidArgument(
+            "OutSize's dimension size must be 1, but got size is %d.",
+            out_size_dim.size()));
     PADDLE_ENFORCE_EQ(out_size_dim[0], 3,
-                      "OutSize's dim[0] must be 3, but got size = %d .",
-                      out_size_dim[0]);
+                      platform::errors::InvalidArgument(
+                          "OutSize's dim[0] must be 3, but got size is %d.",
+                          out_size_dim[0]));
     ctx->ShareLoD("X", "Out");
     return;
   }
@@ -289,10 +293,8 @@ class InterpolateOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of InterpolateOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of InterpolationOp should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Interpolate");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Interpolate");
 
     auto dim_x = ctx->GetInputDim("X");  // NCHW format
     PADDLE_ENFORCE(
@@ -534,9 +536,10 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InterpolateGrad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   "Out@GRAD", "InterpolateGrad");
+
     auto dim_x = ctx->GetInputDim("X");
     if (ctx->HasOutput(framework::GradVarName("X"))) {
       ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc
index 667c6e892956e29478f1401c3cb2622713433037..7cc07383bfa5f67a2404b220cb481d9017b40fd8 100644
--- a/paddle/fluid/operators/linspace_op.cc
+++ b/paddle/fluid/operators/linspace_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/linspace_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -21,7 +22,7 @@ class LinspaceOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace");
     OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace");
     OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace");
@@ -50,11 +51,17 @@ class LinspaceOp : public framework::OperatorWithKernel {
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
+      const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
         ctx.GetPlace());
   }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const framework::Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const override {
+    return expected_kernel_type;
+  }
 };
 
 class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index c9b852cfc05127a4bbf00ea23a751c59dc2d109d..87d914aa79753fbdc9d859c43bbf749b3ddf95cf 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -44,8 +44,10 @@ class MergeLoDTensorOp : public framework::OperatorBase {
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
     auto level = static_cast<size_t>(Attr<int>("level"));
 
-    PADDLE_ENFORCE(in_true.numel() || in_false.numel(),
-                   "Input(InTrue) or Input(InFalse) should be initialized.");
+    PADDLE_ENFORCE_EQ(
+        in_true.numel() || in_false.numel(), true,
+        platform::errors::InvalidArgument(
+            "Input(InTrue) or Input(InFalse) should be initialized."));
 
     auto &mask_dim = mask.dims();
     std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
@@ -56,7 +58,9 @@ class MergeLoDTensorOp : public framework::OperatorBase {
       framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
                             cpu_mask.get());
 #else
-      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "Not supported GPU, Please recompile or reinstall paddle with CUDA "
+          "support."));
 #endif
     }
     auto *mask_data = cpu_mask->data<bool>();
@@ -109,7 +113,11 @@ class MergeLoDTensorOp : public framework::OperatorBase {
       size_t start_offset = lod_and_offset.second.first;
       size_t end_offset = lod_and_offset.second.second;
 
-      PADDLE_ENFORCE_GE(end_offset, start_offset);
+      PADDLE_ENFORCE_GE(end_offset, start_offset,
+                        platform::errors::InvalidArgument(
+                            "The end offset less than start offset, end offset "
+                            "is %d, start offset is %d.",
+                            end_offset, start_offset));
       size_t len = end_offset - start_offset;
       if (len == 0) {
         continue;
@@ -189,22 +197,24 @@ class MergeLoDTensorInferShape : public framework::InferShapeBase {
                    "merge_lod_tensor");
     auto mask_dim = context->GetInputDim("Mask");
     PADDLE_ENFORCE_EQ(mask_dim.size(), 2,
-                      "If you are using IfElse OP:"
-                      "\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
-                      "ie.true_block():\n    out_1 = ie.input(x)\n\n"
-                      "Please ensure that the cond should be a 2-D tensor and "
-                      "the second dim size of cond should be 1. "
-                      "But now the cond's shape is [",
-                      *mask_dim.Get(), "].\n");
+                      platform::errors::InvalidArgument(
+                          "If you are using IfElse OP:"
+                          "\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
+                          "ie.true_block():\n    out_1 = ie.input(x)\n\n"
+                          "Please ensure that the cond is a 2-D tensor and "
+                          "the second dim size of cond is 1. "
+                          "But now the cond's shape is [%s].\n",
+                          mask_dim));
     if (context->IsRuntime() || mask_dim[1] > 0) {
       PADDLE_ENFORCE_EQ(mask_dim[1], 1,
-                        "If you are using IfElse OP:"
-                        "\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
-                        "ie.true_block():\n    out_1 = ie.input(x)\n\n"
-                        "Please ensure that the cond should be a 2-D tensor "
-                        "and the second dim size of cond should be 1. "
-                        "But now the cond's shape is [",
-                        *mask_dim.Get(), "].\n");
+                        platform::errors::InvalidArgument(
+                            "If you are using IfElse OP:"
+                            "\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
+                            "ie.true_block():\n    out_1 = ie.input(x)\n\n"
+                            "Please ensure that the cond is a 2-D tensor "
+                            "and the second dim size of cond is 1. "
+                            "But now the cond's shape is [%s].\n",
+                            mask_dim));
     }
 
     context->SetOutputDim("Out", context->GetInputDim("InTrue"));
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
index 5c6c38da92808f05c90e7dad2482e7c7364a1f80..eb41d21e09218b203f887d8fd812d46dc8367c71 100644
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
@@ -23,46 +23,54 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment"),
-                   "Input(Moment) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("LearningRate"),
-        "Input(LearningRate) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Grad").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                   "Output(MomentOut) of DecayedAdagradOp should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param",
+                   "DecayedAdagradOp");
+    OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DecayedAdagradOp");
+    OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment",
+                   "DecayedAdagradOp");
+    OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate",
+                   "DecayedAdagradOp");
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Param").front(),
+        framework::proto::VarType::LOD_TENSOR,
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->Inputs("Param").front(),
+            ctx->GetInputsVarType("Param").front()));
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Grad").front(),
+        framework::proto::VarType::LOD_TENSOR,
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->Inputs("Grad").front(),
+            ctx->GetInputsVarType("Grad").front()));
+
+    OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut",
+                   "DecayedAdagradOp");
+    OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut",
+                   "DecayedAdagradOp");
 
     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
+                      platform::errors::InvalidArgument(
+                          "Maybe the Input variable LearningRate has not "
+                          "been initialized. You may need to confirm "
+                          "if you put exe.run(startup_program) "
+                          "after optimizer.minimize function."));
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "LearningRate should have one element");
+                      platform::errors::InvalidArgument(
+                          "LearningRate should have one element"));
     auto param_dims = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"),
-                      "Param and Grad input of DecayedAdagradOp should have "
-                      "the same dimension.");
-    PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"),
-                      "Param and Moment input of DecayedAdagradOp should have "
-                      "the same dimension.");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        platform::errors::InvalidArgument(
+            "Param and Grad input of DecayedAdagradOp should have "
+            "the same dimension."));
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment"),
+        platform::errors::InvalidArgument(
+            "Param and Moment input of DecayedAdagradOp should have "
+            "the same dimension."));
 
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("MomentOut", param_dims);
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h
index 279edfb015c26848d4078975a40bdca650bdc6a0..f264ebf8a32636a1e2076f8721b3c95d65f5382b 100644
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h
@@ -24,17 +24,19 @@ class DecayedAdagradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.InputNames("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
+    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Param").front(),
+                          framework::ToTypeName(param_var->Type())));
     const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.InputNames("Grad").front(),
-                   framework::ToTypeName(grad_var->Type()));
+    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Grad").front(),
+                          framework::ToTypeName(grad_var->Type())));
 
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h
index b579b5143ddbe6221738f9864f13fb7bea4ac509..55775bc08fb5ebc31cd231b8088a9798561fabfc 100755
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.h
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h
@@ -30,7 +30,12 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
     auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
     auto* grad_var = ctx.InputVar("Grad");
     // only support dense for now.
-    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true);
+    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Grad").front(),
+                          framework::ToTypeName(grad_var->Type())));
     auto grad = ctx.Input<framework::LoDTensor>("Grad");
 
     param_out->mutable_data<T>(ctx.GetPlace());
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index f20bada8ab288fe74fd8ca82a73522a22b234191..142b00b4de66caaedda5c4f0723d31e3a819b8a4 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -60,20 +60,33 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
   auto place = ctx.GetPlace();
 
   PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(),
-                    "src and dst tensor should have the same dims size.");
+                    platform::errors::InvalidArgument(
+                        "Source and destination tensor should have the same "
+                        "dimension size, but source tensor dimension size is "
+                        "%u, destination tensor size is %u.",
+                        src_stride_numel.size(), dst_stride_numel.size()));
 
   for (int64_t i = 0; i < axis; ++i) {
     if (i < axis) {
-      PADDLE_ENFORCE_EQ(src_stride_numel[i] / src_stride_numel[axis],
-                        dst_stride_numel[i] / dst_stride_numel[axis],
-                        "src and dst should have the same elements "
-                        "except the specified axis.");
+      PADDLE_ENFORCE_EQ(
+          src_stride_numel[i] / src_stride_numel[axis],
+          dst_stride_numel[i] / dst_stride_numel[axis],
+          platform::errors::InvalidArgument(
+              "Source and destination tensor should have the same number of "
+              "elements except the specified axis, but the source elements "
+              "number is %d, destination elements number is %d.",
+              src_stride_numel[i] / src_stride_numel[axis],
+              dst_stride_numel[i] / dst_stride_numel[axis]));
     } else if (i == axis) {
       continue;
     } else {
-      PADDLE_ENFORCE_EQ(src_stride_numel[i], dst_stride_numel[i],
-                        "src and dst should have the same elements "
-                        "except the specified axis.");
+      PADDLE_ENFORCE_EQ(
+          src_stride_numel[i], dst_stride_numel[i],
+          platform::errors::InvalidArgument(
+              "Source and destination tensor should have the same number of "
+              "elements except the specified axis, but the source elements "
+              "number is %d, destination elements number is %d.",
+              src_stride_numel[i], dst_stride_numel[i]));
     }
   }
 
@@ -90,7 +103,8 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
       memory::Copy(gpu_place, dst + i * dst_after, gpu_place,
                    src + i * src_after, sizeof(T) * size, cuda_ctx.stream());
 #else
-      PADDLE_THROW("Paddle is not compiled with GPU");
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "Paddle is not compiled with GPU."));
 #endif
     }
   }
diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc
index f8a29a52d7a3d9332b9dcb8189dfd7c1df902faa..db8b2c30501bd7f291b23728a26dcd3ea27e0ec5 100644
--- a/paddle/fluid/operators/var_conv_2d_op.cc
+++ b/paddle/fluid/operators/var_conv_2d_op.cc
@@ -78,21 +78,35 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const {
       platform::errors::NotFound("Col(Output) of VarConv2dOP is not found."));
 
   auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2,
-                    "The rank of X(Input) can't be less than 2.");
+  PADDLE_ENFORCE_EQ(
+      x_dims.size(), 2,
+      platform::errors::InvalidArgument(
+          "The rank of X(Input) can't be less than 2, but received rank is %u.",
+          x_dims.size()));
 
   auto w_dims = ctx->GetInputDim("W");
 
-  PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor");
+  PADDLE_ENFORCE_EQ(
+      w_dims.size(), 2,
+      platform::errors::InvalidArgument(
+          "Input W should be a 2-D tensor, but its actual dimension is %u.",
+          w_dims.size()));
   int output_channel = ctx->Attrs().Get<int>("OutputChannel");
   int input_channel = ctx->Attrs().Get<int>("InputChannel");
   int kernel_h = ctx->Attrs().Get<int>("KernelH");
   int kernel_w = ctx->Attrs().Get<int>("KernelW");
-  PADDLE_ENFORCE_EQ(w_dims[0], output_channel,
-                    "W dim[0] should be equal to OutputChannel");
+  PADDLE_ENFORCE_EQ(
+      w_dims[0], output_channel,
+      platform::errors::InvalidArgument(
+          "Input W's dimension[0] should be equal to OutputChannel, the "
+          "dimension[0] is %d, OutputChannel is %d.",
+          w_dims[0], output_channel));
   PADDLE_ENFORCE_EQ(
       w_dims[1], input_channel * kernel_h * kernel_w,
-      "W dim[1] should be equal to InputChannel * StrideH * StrideW");
+      platform::errors::InvalidArgument(
+          "Input W's dimension[1] should be equal to InputChannel * StrideH * "
+          "StrideW, the dimension[1] is %d, expected value is %d.",
+          w_dims[1], input_channel * kernel_h * kernel_w));
 
   if (ctx->IsRuntime()) {
     framework::Variable* x_var =
@@ -103,10 +117,14 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const {
         platform::errors::InvalidArgument("The Input(X) Tensor of VarConv2dOP "
                                           "does not contain LoD information."));
 
-    PADDLE_ENFORCE_GE(x_lod.size(), 1, "The Input(X)'s lod info is corrupted.");
-    PADDLE_ENFORCE_EQ(
-        x_dims[0], static_cast<int64_t>(x_lod[0].back()),
-        "The Input(X)'s lod info mismatches the actual tensor shape.");
+    PADDLE_ENFORCE_GE(x_lod.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "The Input(X)'s lod info is corrupted."));
+    PADDLE_ENFORCE_EQ(x_dims[0], static_cast<int64_t>(x_lod[0].back()),
+                      platform::errors::InvalidArgument(
+                          "The Input(X)'s lod info mismatches the actual "
+                          "tensor shape, input lod is %s, tensor shape is %s.",
+                          x_lod, x_dims));
 
     framework::Variable* row_var =
         BOOST_GET(framework::Variable*, ctx->GetInputVarPtrs("ROW")[0]);
diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h
index 957bdf1e698d0aedb86c5b0cb732ab545c260bcc..a9382f2c8adcb18e320ef44086a312f89c03ad09 100644
--- a/paddle/fluid/platform/cuda_profiler.h
+++ b/paddle/fluid/platform/cuda_profiler.h
@@ -24,7 +24,11 @@ namespace platform {
 
 void CudaProfilerInit(std::string output_file, std::string output_mode,
                       std::string config_file) {
-  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
+  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv",
+                 platform::errors::InvalidArgument(
+                     "Unsupported cuda profiler output mode, expect `kvp` or "
+                     "`csv`, but received `%s`.",
+                     output_mode));
   cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index 1166dc5e4ad93fa23ef00623de6777b78b56ea09..4c59fe5e9bae4b751d87b0d2feb1ea0bd02bcf1d 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -30,6 +30,10 @@ CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
 #endif
 
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DEFINE_WRAP);
+#endif
+
 #ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
 CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
 #endif
@@ -54,6 +58,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
 #endif
 
+#ifdef CUDNN_DNN_ROUTINE_EACH_R8
+CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
+#endif
+
 bool HasCUDNN() {
   std::call_once(cudnn_dso_flag,
                  []() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index fba41417648ba606727d00e71f48766f47479989..dd0a2e1968501a3375ad5691b27fa0c922cf2ab4 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -134,6 +134,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \
   __macro(cudnnGetConvolutionBackwardFilterAlgorithm);   \
   __macro(cudnnGetConvolutionForwardAlgorithm);          \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm);     \
   __macro(cudnnSetRNNDescriptor);
 CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 489dd198876204486fc94518fbef0c806d0543d4..da9900e2b271d08394cbc5e397f31b84e3b4d156 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -649,61 +649,47 @@ void BindImperative(py::module *m_ptr) {
              return self.NewVarBase(tensor.place(), false);
            },
            py::return_value_policy::copy, R"DOC(
-        **Notes**:
-            **This API is ONLY available in Dygraph mode**
 
-        Returns a new Variable, detached from the current graph.
-
-        Returns:
-             ( :ref:`api_guide_Variable_en` | dtype is same as current Variable): The detached Variable.
+        Returns a new Tensor, detached from the current graph.
 
+        Returns: The detached Tensor.
 
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                from paddle.fluid.dygraph.base import to_variable
-                from paddle.fluid.dygraph import Linear
-                import numpy as np
-
-                data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
-                with fluid.dygraph.guard():
-                    linear = Linear(32, 64)
-                    data = to_variable(data)
-                    x = linear(data)
-                    y = x.detach()
+                import paddle
+                paddle.disable_static()
 
+                linear = Linear(32, 64)
+                data = paddle.uniform(shape=[30, 10, 32], -1, 1)
+                x = linear(data)
+                y = x.detach()
        )DOC")
       .def("clear_gradient", &imperative::VarBase::ClearGradient, R"DOC(
 
-        **Notes**:
-        **1. This API is ONLY available in Dygraph mode**
-
-        **2. Use it only Variable has gradient, normally we use this for Parameters since other temporal Variable will be deleted by Python's GC**
+        Only for Tensor that has gradient, normally we use this for Parameters since other temporary Tensor doesen't has gradient.
 
-        Clear  (set to ``0`` ) the Gradient of Current Variable
+        The Gradient of current Tensor will be set to ``0`` .
 
         Returns:  None
 
         Examples:
              .. code-block:: python
 
-                import paddle.fluid as fluid
-                import numpy as np
-
-                x = np.ones([2, 2], np.float32)
-                with fluid.dygraph.guard():
-                    inputs2 = []
-                    for _ in range(10):
-                         tmp = fluid.dygraph.base.to_variable(x)
-                         tmp.stop_gradient=False
-                         inputs2.append(tmp)
-                    ret2 = fluid.layers.sums(inputs2)
-                    loss2 = fluid.layers.reduce_sum(ret2)
-                    loss2.backward()
-                    print(loss2.gradient())
-                    loss2.clear_gradient()
-                    print("After clear {}".format(loss2.gradient()))
+                import paddle
+                paddle.disable_static()
+
+                inputs = []
+                for _ in range(10):
+                    tmp = paddle.ones([2, 2])
+                    tmp.stop_gradient=False
+                    inputs.append(tmp)
+                ret = paddle.sums(inputs2)
+                loss = paddle.reduce_sum(ret)
+                loss.backward()
+                print("Before clear_gradient {}".format(loss.grad))
+                loss.clear_gradient()
+                print("After clear_gradient {}".format(loss.grad))
       )DOC")
       .def("_run_backward",
            [](imperative::VarBase &self, const imperative::Tracer &tracer,
diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt
index 235d92ac4f9e88947cea04425b0916b8a0290979..d587081fbac8a27df18bdacba3d94f6adcd3b171 100644
--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
@@ -26,7 +26,7 @@ function(train_test TARGET_NAME)
                     ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/)
         endif()
         set_tests_properties(test_train_${TARGET_NAME}${arg}
-                PROPERTIES DEPENDS test_${TARGET_NAME})
+                PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model)
         if(NOT WIN32 AND NOT APPLE)
             set_tests_properties(test_train_${TARGET_NAME}${arg}
                     PROPERTIES TIMEOUT 150)
diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc
index 1087f5672459506cc7b824127cd822c0df7ba566..1ef98720f83697715c05e868177faba489fd8760 100644
--- a/paddle/fluid/train/demo/demo_trainer.cc
+++ b/paddle/fluid/train/demo/demo_trainer.cc
@@ -29,7 +29,9 @@ namespace train {
 
 void ReadBinaryFile(const std::string& filename, std::string* contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  PADDLE_ENFORCE_EQ(
+      fin.is_open(), true,
+      platform::errors::Unavailable("Failed to open file %s.", filename));
   fin.seekg(0, std::ios::end);
   contents->clear();
   contents->resize(fin.tellg());
@@ -70,7 +72,8 @@ int main() {
     }
   }
 
-  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
+  PADDLE_ENFORCE_NE(loss_name, "",
+                    platform::errors::NotFound("Loss name is not found."));
 
   // init all parameters
   executor.Run(*startup_program, &scope, 0);
diff --git a/paddle/fluid/train/imdb_demo/demo_trainer.cc b/paddle/fluid/train/imdb_demo/demo_trainer.cc
index d45edd563f03d7a1b156d063d5e7296290d0eaba..a08069a57ca824f307b4bf8836237f573ab3c429 100644
--- a/paddle/fluid/train/imdb_demo/demo_trainer.cc
+++ b/paddle/fluid/train/imdb_demo/demo_trainer.cc
@@ -45,7 +45,9 @@ namespace train {
 
 void ReadBinaryFile(const std::string& filename, std::string* contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  PADDLE_ENFORCE_EQ(
+      fin.is_open(), true,
+      platform::errors::Unavailable("Failed to open file %s.", filename));
   fin.seekg(0, std::ios::end);
   contents->clear();
   contents->resize(fin.tellg());
@@ -98,7 +100,11 @@ int main(int argc, char* argv[]) {
       file_vec.push_back(filename);
     }
   }
-  PADDLE_ENFORCE_GE(file_vec.size(), 1, "At least one file to train");
+  PADDLE_ENFORCE_GE(
+      file_vec.size(), 1,
+      platform::errors::InvalidArgument(
+          "At least one file to train, but received number of file is %d.",
+          file_vec.size()));
   paddle::framework::InitDevices(false);
   const auto cpu_place = paddle::platform::CPUPlace();
   paddle::framework::Executor executor(cpu_place);
@@ -148,7 +154,9 @@ int main(int argc, char* argv[]) {
     const std::vector<paddle::framework::DataFeed*> readers =
         dataset_ptr->GetReaders();
     PADDLE_ENFORCE_EQ(readers.size(), 1,
-                      "readers num should be equal to thread num");
+                      platform::errors::InvalidArgument(
+                          "Readers num(%d) should be equal to thread num(1).",
+                          readers.size()));
     readers[0]->SetPlace(paddle::platform::CPUPlace());
     const std::vector<std::string>& input_feed_names =
         readers[0]->GetUseSlotAlias();
diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc
index 45c438e8925b4e0a88e61ad509b88cd6226773a4..e7b698e1a34e267e392d696b67b92cd2e8c23f3b 100644
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ b/paddle/fluid/train/test_train_recognize_digits.cc
@@ -51,7 +51,8 @@ void Train() {
     }
   }
 
-  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
+  PADDLE_ENFORCE_NE(loss_name, "",
+                    platform::errors::NotFound("Loss name is not found."));
 
   // prepare data
   auto x_var = scope.Var("img");
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 60e4496bc545759173b68efbf85922efe8976fa4..524c086c07925c880dfb46a70a1f930686bae867 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -26,6 +26,7 @@ wmic process where name="op_function_generator.exe" call terminate  2>NUL
 rem ------initialize common variable------
 if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0"
 if not defined BRANCH set BRANCH=develop
+if not defined TENSORRT_ROOT set TENSORRT_ROOT="C:/TensorRT-5.1.5.0"
 if not defined WITH_MKL set WITH_MKL=ON
 if not defined WITH_GPU set WITH_GPU=OFF
 if not defined WITH_AVX set WITH_AVX=ON
@@ -33,9 +34,11 @@ if not defined WITH_TESTING set WITH_TESTING=ON
 if not defined WITH_PYTHON set WITH_PYTHON=ON
 if not defined ON_INFER set ON_INFER=ON
 if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON
+if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON
 if not defined WITH_CACHE set WITH_CACHE=ON
 if not defined WITH_TPCACHE set WITH_TPCACHE=ON
 
+
 rem -------set cache build work directory-----------
 if "%WITH_CACHE%"=="OFF" (
     rmdir build /s/q
@@ -99,6 +102,7 @@ set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
 :: set maximum cache size to 20G
 clcache.exe -M 21474836480
 
+
 rem ------set cache third_party------
 set cache_dir=%work_dir:Paddle=cache%
 dir %cache_dir%
@@ -138,6 +142,7 @@ exit /b 1
 :CASE_wincheck_mkl
 set WITH_MKL=ON
 set WITH_GPU=OFF
+set MSVC_STATIC_CRT=ON
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
@@ -149,11 +154,13 @@ goto:success
 :CASE_wincheck_openblas
 set WITH_MKL=OFF
 set WITH_GPU=ON
+set MSVC_STATIC_CRT=OFF
 rem Temporarily turn off WITH_INFERENCE_API_TEST on GPU due to compile hang
 set WITH_INFERENCE_API_TEST=OFF
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
+:: call :test_inference || goto test_inference_error
 goto:success
 
 rem "Other configurations are added here"
@@ -172,12 +179,14 @@ set start=%start:~4,10%
 echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
 -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
 -DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
--DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR%
+-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
+-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT%
 
 cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
 -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
 -DON_INFER=%ON_INFER%  -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
--DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR%
+-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
+-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT%
 goto:eof
 
 :cmake_error
@@ -282,7 +291,9 @@ dir %THIRD_PARTY_PATH:/=\%\install\mklml\lib
 dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin
 dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin
 
-set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH%
+set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;^
+%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^
+%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH%
 ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4
 goto:eof
 
@@ -305,7 +316,7 @@ set end=%end:~4,10%
 call :timestamp "%start%" "%end%" "TestCases Total"
 
 cd %work_dir%\paddle\fluid\inference\api\demo_ci
-%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo
+%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT%/include %TENSORRT_ROOT%/lib %MSVC_STATIC_CRT%
 goto:eof
 
 :test_inference_error
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index 81d5908ccd4f859e36ae98a157fac2d37214c271..f66f013e4dbaadd534d6859b7ba6530779c82a3b 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -605,7 +605,8 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         """
         if not self._role_is_generated:
             self._generate_role()
-        return len(self._get_pserver_endpoints())
+        return len(self._get_pserver_endpoints(
+        )) if self._get_pserver_endpoints() is not None else 0
 
     def _node_num(self):
         """
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
index ae5c53b8a37c4958e58ed5b09ce7cc8194f1ff52..6dd4661f00062f55bb834bbee50daf1924a0c87a 100644
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -220,12 +220,12 @@ class ParameterServerRuntime(RuntimeBase):
         else:
             model_dirname = None
 
-        if self.role_maker._is_heter_worker():
-            self._init_worker()
-
         executor = self._get_executor()
         executor.run(fluid.default_startup_program())
 
+        if self.role_maker._is_heter_worker():
+            self._init_worker()
+
         if self.role_maker._is_heter_worker():
             return
 
diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py
index 7b564b3f837c001673bdd272ba60edf31cde21fb..ac6493b1c2969a8c3319bc8d29983b0ccc3a67d9 100644
--- a/python/paddle/fluid/contrib/layers/nn.py
+++ b/python/paddle/fluid/contrib/layers/nn.py
@@ -45,6 +45,7 @@ from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
 from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
 
 from paddle.fluid import core
+from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.entry_attr import ProbabilityEntry, CountFilterEntry
 
 from paddle.fluid.framework import Variable, convert_np_dtype_to_dtype_
@@ -57,7 +58,7 @@ __all__ = [
     'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat',
     'sparse_embedding', 'partial_sum', 'tdm_child', 'rank_attention',
     'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice',
-    'correlation'
+    'correlation', 'fused_bn_add_act'
 ]
 
 
@@ -1625,3 +1626,191 @@ def correlation(x,
             },
             outputs={"Output": output})
     return output
+
+
+def fused_bn_add_act(x,
+                     y,
+                     momentum=0.9,
+                     epsilon=1e-05,
+                     param_attr=None,
+                     bias_attr=None,
+                     moving_mean_name=None,
+                     moving_variance_name=None,
+                     act=None,
+                     name=None):
+    """
+    This Op performs batch norm on input x, and adds the result to input y. Then
+    it performs activation on the sum. The data format of inputs must be NHWC
+    `[batch, in_height, in_width, in_channels]`.
+
+    Args:
+        x(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
+            is float16.
+        y(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
+            is float16.
+        momentum(float|Tensor, optional): The value used for the moving_mean and
+            moving_var computation. This should be a float number or a tensor with
+            shape [1] and data type as float32. The updated formula is:
+            :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
+            :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
+            Default is 0.9.
+        epsilon(float, optional): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+	        will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
+	        If the Initializer of the param_attr is not set, the parameter is initialized
+	        with Xavier. Default: None.
+        bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+	        will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
+	        If the Initializer of the bias_attr is not set, the bias is initialized zero.
+	        Default: None.
+        moving_mean_name(str, optional): The name of moving_mean which store the global Mean. If it
+            is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
+            will save global mean with the string.
+        moving_variance_name(str, optional): The name of the moving_variance which store the global Variance.
+            If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
+            will save global variance with the string.
+        act(string, optional): Activation type, linear|relu|prelu|...
+        name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
+            Usually name is no need to set and None by default.
+
+    Examples:
+            .. code-block:: python
+
+            import paddle.fluid as fluid
+
+            def build_program(main_program, startup_program):
+                with fluid.program_guard(main_program, startup_program):
+                    x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
+                    y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+                    conv1_1 = fluid.layers.conv2d(
+                        input=x,
+                        filter_size=3,
+                        num_filters=32,
+                        stride=1,
+                        padding=1,
+                        act=None,
+                        bias_attr=False,
+                        data_format='NHWC')
+                    conv1_2 = fluid.layers.conv2d(
+                        input=x,
+                        filter_size=3,
+                        num_filters=32,
+                        stride=1,
+                        padding=1,
+                        act=None,
+                        bias_attr=False,
+                        data_format='NHWC')
+                    bn = fluid.layers.batch_norm(
+                        input=conv1_1,
+                        act=None,
+                        data_layout='NHWC')
+                    fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(conv1_2, bn)
+                    prediction = fluid.layers.fc(input=fused_bn_add_act, size=10, act='softmax')
+                    loss = fluid.layers.cross_entropy(input=prediction, label=y)
+                    loss = fluid.layers.mean(loss)
+                    sgd = fluid.optimizer.SGD(learning_rate=0.001)
+                    sgd = fluid.contrib.mixed_precision.decorate(
+                        sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
+                    sgd.minimize(loss)
+
+                return x, y, loss
+
+            iters = 5
+            batch_size = 16
+            support_gpu = fluid.is_compiled_with_cuda()
+            if support_gpu:
+                main_program = fluid.Program()
+                startup_program = fluid.Program()
+                place = fluid.CUDAPlace(0)
+                x, y, loss = build_program(main_program, startup_program)
+  
+                feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
+                train_reader = paddle.batch(
+                    paddle.dataset.mnist.train(), batch_size=batch_size)
+                exe = fluid.Executor(place)
+                scope = fluid.Scope()
+                with fluid.scope_guard(scope):
+                    exe.run(startup_program)
+                    for _ in range(iters):
+                        data = next(train_reader())
+                        loss_v = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss])
+    """
+    helper = LayerHelper('fused_bn_add_act', **locals())
+
+    check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                             'fused_bn_add_act')
+    check_variable_and_dtype(y, 'input', ['float16', 'float32', 'float64'],
+                             'fused_bn_add_act')
+    bn_param_dtype = core.VarDesc.VarType.FP32
+
+    x_shape = x.shape
+    channel_num = x_shape[-1]
+    param_shape = [channel_num]
+
+    # create parameter
+    scale = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=param_shape,
+        dtype=bn_param_dtype,
+        default_initializer=Constant(1.0))
+    bias = helper.create_parameter(
+        attr=helper.bias_attr,
+        shape=param_shape,
+        dtype=bn_param_dtype,
+        is_bias=True)
+    mean = helper.create_parameter(
+        attr=ParamAttr(
+            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
+        shape=param_shape,
+        dtype=bn_param_dtype)
+    mean.stop_gradient = True
+    variance = helper.create_parameter(
+        attr=ParamAttr(
+            name=moving_variance_name,
+            initializer=Constant(1.0),
+            trainable=False),
+        shape=param_shape,
+        dtype=bn_param_dtype)
+    variance.stop_gradient = True
+
+    # create output
+    # mean and mean_out share the same memory
+    mean_out = mean
+    # variance and variance out share the same memory
+    variance_out = variance
+    saved_mean = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True)
+    saved_variance = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True)
+    reserve_space = helper.create_variable_for_type_inference(
+        dtype=core.VarDesc.VarType.FP16, stop_gradient=True)
+    batch_norm_out = helper.create_variable_for_type_inference(
+        core.VarDesc.VarType.FP16)
+
+    inputs = {
+        "X": x,
+        "Z": y,
+        "Scale": scale,
+        "Bias": bias,
+    }
+    attrs = {"epsilon": epsilon, 'momentum': momentum}
+
+    outputs = {
+        "Y": batch_norm_out,
+        "MeanOut": mean_out,
+        "VarianceOut": variance_out,
+        "SavedMean": saved_mean,
+        "SavedVariance": saved_variance,
+        "ReserveSpace": reserve_space
+    }
+
+    helper.append_op(
+        type="fused_bn_add_activation",
+        inputs=inputs,
+        outputs=outputs,
+        attrs=attrs)
+
+    return batch_norm_out
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
index 1f301b7148d005d4e3d5d272fd78f78af6dc1e6a..a9f080c514dff078b0068bce262fa177fd0b0db2 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
@@ -135,6 +135,7 @@ gray_list = {
     'get_tensor_from_selected_rows',
     'sign',
     'cast',
+    'fused_bn_add_activation',
 }
 '''
 # The set of ops that don't support fp16 calculation
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index 0b142ff33de55f36410eb9c23cb75210fc9d6321..0ff166d8dc89ac79c36343df9bc379cb171c36fd 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -69,8 +69,10 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
     ]
 
     for in_name in op.input_names:
-        if src_dtype == core.VarDesc.VarType.FP32 and op.type == 'batch_norm':
-            if in_name != 'X':
+        if src_dtype == core.VarDesc.VarType.FP32 and op.type in [
+                'batch_norm', 'fused_bn_add_activation'
+        ]:
+            if in_name not in {'X', 'Z'}:
                 continue
         for in_var_name in op.input(in_name):
             in_var = block.var(in_var_name)
@@ -102,7 +104,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
                     op._set_attr('in_dtype', dest_dtype)
     if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16:
         for out_name in op.output_names:
-            if op.type == 'batch_norm' and out_name != 'Y':
+            if op.type in ['batch_norm', 'fused_bn_add_activation'
+                           ] and out_name != 'Y':
                 continue
             for out_var_name in op.output(out_name):
                 out_var = block.var(out_var_name)
diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py
index 3aa7b9dfc262810686319819f717f3cfd06b5e50..68206f62860852b1124b65da0e4124f60a2a8051 100644
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
@@ -17,8 +17,7 @@ from __future__ import print_function
 from .. import core
 from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator
 from ..layers.layer_function_generator import OpProtoHolder
-from ..layers import common_methods
-from . import to_variable, no_grad
+from . import no_grad
 
 import numpy as np
 import six
@@ -53,47 +52,25 @@ def monkey_patch_math_varbase():
 
     def astype(self, dtype):
         """
-        **Notes**:
-            **The variable must be a** :ref:`api_fluid_Tensor`
 
-        Cast a variable to a specified data type.
+        Cast a Tensor to a specified data type.
 
         Args:
-
-            self(Variable): The source variable
-
-            dtype: The target data type
+            dtype: The target data type.
 
         Returns:
-            Variable: Variable with new dtype
+            Tensor: a new Tensor with target dtype
 
         Examples:
-            In Static Graph Mode:
-
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                startup_prog = fluid.Program()
-                main_prog = fluid.Program()
-                with fluid.program_guard(startup_prog, main_prog):
-                    original_variable = fluid.data(name = "new_variable", shape=[2,2], dtype='float32')
-                    new_variable = original_variable.astype('int64')
-                    print("new var's dtype is: {}".format(new_variable.dtype))
-
-            In Dygraph Mode:
-
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                import paddle
                 import numpy as np
 
-                x = np.ones([2, 2], np.float32)
-                with fluid.dygraph.guard():
-                    original_variable = fluid.dygraph.to_variable(x)
-                    print("original var's dtype is: {}, numpy dtype is {}".format(original_variable.dtype, original_variable.numpy().dtype))
-                    new_variable = original_variable.astype('int64')
-                    print("new var's dtype is: {}, numpy dtype is {}".format(new_variable.dtype, new_variable.numpy().dtype))
+                original_tensor = paddle.ones([2, 2])
+                print("original tensor's dtype is: {}".format(original_tensor.dtype))
+                new_tensor = original_tensor.astype('float32')
+                print("new tensor's dtype is: {}".format(new_tensor.dtype))
 
         """
         if not isinstance(dtype, core.VarDesc.VarType):
@@ -147,6 +124,10 @@ def monkey_patch_math_varbase():
     def _ndim_(var):
         return len(var.shape)
 
+    @property
+    def _size_(var):
+        return np.prod(var.shape)
+
     def _scalar_add_(var, value):
         return _scalar_elementwise_op_(var, 1.0, value)
 
@@ -208,7 +189,6 @@ def monkey_patch_math_varbase():
         __impl__.__doc__ = """
         {0}
         Args:
-            self(Tensor): left hand Tensor
             other_var(Tensor|float|int): right hand Tensor
 
         Returns:
@@ -217,23 +197,7 @@ def monkey_patch_math_varbase():
         __impl__.__name__ = method_name
         return __impl__
 
-    # Todo(zhouwei): implement dygraph template to adapt to any function, receive('op_type', 'arg_template')
-    #  Such as _method_creator_('addmm', 'x, y, alpha=1.0, beta=1.0, name=None'). It can reduce call time.
-    def _method_creator_(op_type, arg_template=None):
-        def __impl__(self):
-            op = getattr(core.ops, op_type)
-            return op(self)
-
-        __impl__.__doc__ = """
-
-        See paddle.{}""".format(op_type)
-        __impl__.__name__ = op_type
-
-        return __impl__
-
     varbase_methods = [
-        # Type1: From custom fun or lambda
-        ##   b=-a
         ('__neg__', _neg_),
         ('__float__', _float_),
         ('__long__', _long_),
@@ -244,8 +208,7 @@ def monkey_patch_math_varbase():
         ('dim', lambda x: len(x.shape)),
         ('ndimension', lambda x: len(x.shape)),
         ('ndim', _ndim_),
-        ('size', lambda x: x.shape),
-        # Type2: From Template that create core.ops automatically. It's recommended.
+        ('size', _size_),
         ('__add__',
          _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)),
         ##  a+b == b+a. Do not need to reverse explicitly
@@ -283,31 +246,7 @@ def monkey_patch_math_varbase():
         ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
         ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)),
         ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)),
-        ('__array_ufunc__', None),
-        ('sigmoid', _method_creator_('sigmoid', 'name=None')),
-        ('log_sigmoid', _method_creator_('logsigmoid', 'name=None')),
-        ('exp', _method_creator_('exp', 'name=None')),
-        ('tanh', _method_creator_('tanh', 'name=None')),
-        ('atan', _method_creator_('atan', 'name=None')),
-        ('tanh_shrink', _method_creator_('tanh_shrink', 'name=None')),
-        ('sqrt', _method_creator_('sqrt', 'name=None')),
-        ('rsqrt', _method_creator_('rsqrt', 'name=None')),
-        ('abs', _method_creator_('abs', 'name=None')),
-        ('ceil', _method_creator_('ceil', 'name=None')),
-        ('floor', _method_creator_('floor', 'name=None')),
-        ('cos', _method_creator_('cos', 'name=None')),
-        ('acos', _method_creator_('acos', 'name=None')),
-        ('asin', _method_creator_('asin', 'name=None')),
-        ('sin', _method_creator_('sin', 'name=None')),
-        ('sinh', _method_creator_('sinh', 'name=None')),
-        ('cosh', _method_creator_('cosh', 'name=None')),
-        ('round', _method_creator_('round', 'name=None')),
-        ('reciprocal', _method_creator_('reciprocal', 'name=None')),
-        ('square', _method_creator_('square', 'name=None')),
-        ('softplus', _method_creator_('softplus', 'name=None')),
-        ('softsign', _method_creator_('softsign', 'name=None')),
-        # Type3: Form module 'paddle.tensor' defaultly.
-        #   It's not a goodway, because it will increase call time.
+        ('__array_ufunc__', None)
     ]
 
     global _already_patch_varbase
@@ -318,7 +257,15 @@ def monkey_patch_math_varbase():
             setattr(core.VarBase, method_name, method_impl)
     else:
         import paddle.tensor
-        for method_name in common_methods:
+        # Tensor method from module paddle.tensor
+        tensor_methods = paddle.tensor.linalg.__all__ + \
+                         paddle.tensor.math.__all__ + \
+                         paddle.tensor.logic.__all__ + \
+                         paddle.tensor.manipulation.__all__ + \
+                         paddle.tensor.search.__all__ + \
+                         paddle.tensor.stat.__all__ + \
+                         paddle.tensor.attribute.__all__
+        for method_name in tensor_methods:
             if hasattr(core.VarBase, method_name): continue
             method_impl = getattr(paddle.tensor, method_name, None)
             if method_impl: setattr(core.VarBase, method_name, method_impl)
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
index 236cb458be4c6a07f768761b41464e64d4d53f77..e556a98ed7504b199624deeac10ea594efa269b4 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
@@ -191,12 +191,14 @@ class FleetTranspiler(Fleet):
         self._communicator = Communicator(
             trainer_config.mode, kwargs,
             trainer_config.get_communicator_flags())
+
         self._communicator.init_with_ctx(send_ctx, recv_ctx)
 
         if not self._communicator.is_running():
             self._communicator.start()
         else:
-            warnings.warn("communicator has been initialized, skip")
+            raise ValueError(
+                "Communicator can only be inited once, please check")
 
     def init_worker(self):
         """
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
index 05deff10a2e1c914e9725c7d8697a704db6e7e42..a60c4e149f582e4f364910611d18cda5fbca4f07 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
@@ -624,6 +624,7 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
         value_dims = []
         grad = None
         opt_idx = -1
+        fuse = False
 
         for op in block.ops:
             opt_idx += 1
@@ -631,6 +632,9 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
             if op.type not in opt_value_map.keys():
                 continue
 
+            if op.type in ["sgd", "adam"]:
+                fuse = True
+
             grad = main_program.global_block().vars[op.input("Grad")[0]]
 
             for value in opt_value_map[op.type]:
@@ -644,7 +648,67 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
 
             if value_names:
                 break
-        return grad, opt_idx, value_names, value_dims, acture_names
+        return grad, opt_idx, value_names, value_dims, acture_names, fuse
+
+    def add_fuse_large_scale_op(block, global_block, table_name, value_names,
+                                acture_names, grad, is_entry, opt_idx):
+
+        op = block.ops[opt_idx]
+
+        if op.type == "sgd":
+            grad = main_program.global_block().vars[op.input("Grad")[0]]
+            lr = main_program.global_block().vars[op.input("LearningRate")[0]]
+
+            block._insert_op(
+                opt_idx,
+                type="lookup_sparse_table_fuse_sgd",
+                inputs={"Grad": grad,
+                        "LearningRate": lr},
+                attrs={
+                    "is_entry": is_entry,
+                    "tablename": table_name,
+                    "value_names": value_names
+                })
+
+        elif op.type == "adam":
+            grad = main_program.global_block().vars[op.input("Grad")[0]]
+            lr = main_program.global_block().vars[op.input("LearningRate")[0]]
+            beta1_pow = main_program.global_block().vars[op.input("Beta1Pow")[
+                0]]
+            beta2_pow = main_program.global_block().vars[op.input("Beta2Pow")[
+                0]]
+            beta1_pow_o = main_program.global_block().vars[op.output(
+                "Beta1PowOut")[0]]
+            beta2_pow_o = main_program.global_block().vars[op.output(
+                "Beta2PowOut")[0]]
+
+            beta1 = op.attr('beta1')
+            beta2 = op.attr('beta2')
+            epsilon = op.attr('epsilon')
+
+            block._insert_op(
+                opt_idx,
+                type="lookup_sparse_table_fuse_adam",
+                inputs={
+                    "Grad": grad,
+                    "LearningRate": lr,
+                    "Beta1Pow": beta1_pow,
+                    "Beta2Pow": beta2_pow
+                },
+                outputs={
+                    "Beta1PowOut": beta1_pow_o,
+                    "Beta2PowOut": beta2_pow_o
+                },
+                attrs={
+                    "beta1": beta1,
+                    "beta2": beta2,
+                    "epsilon": epsilon,
+                    "is_entry": is_entry,
+                    "tablename": table_name,
+                    "value_names": value_names
+                })
+        else:
+            raise ValueError("only support sgd/adam optimizer now")
 
     def add_large_scale_op(block, global_block, table_name, value_names,
                            acture_names, grad, is_entry, opt_idx):
@@ -711,24 +775,35 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
         for param, blockid in param_blockid_map.items():
             opt_block = program.block(blockid)
 
-            grad, opt_idx, value_names, value_dims, acture_names = \
+            grad, opt_idx, value_names, value_dims, acture_names, fuse = \
                 get_optimizer_values(opt_block)
 
             entry_attr = get_entry_attr(param)
             is_entry = False if entry_attr == "none" else True
-            add_large_scale_op(opt_block,
-                               program.global_block(), param, value_names,
-                               acture_names, grad, is_entry, opt_idx)
 
+            if fuse:
+                add_fuse_large_scale_op(opt_block,
+                                        program.global_block(), param,
+                                        value_names, acture_names, grad,
+                                        is_entry, opt_idx)
+            else:
+                add_large_scale_op(opt_block,
+                                   program.global_block(), param, value_names,
+                                   acture_names, grad, is_entry, opt_idx)
     else:
         large_scale_kv_metas = []
         for param, blockid in param_blockid_map.items():
             opt_block = main_program.block(blockid)
-            grad, _, value_names, value_dims, acture_names = \
+
+            grad, opt_idx, value_names, value_dims, acture_names, fuse = \
                 get_optimizer_values(opt_block)
 
             entry_attr = get_entry_attr(param)
 
+            if fuse:
+                # remove origin optimzier op
+                opt_block._remove_op(opt_idx)
+
             # training/infer
             mode = "0"
             names_str = ",".join(value_names)
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
index 4543af9820e8c9326098fa254494ca1c896d3b12..3f826da3ae2beca51b639a69da4113e6d9580d6c 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
@@ -227,22 +227,6 @@ def init_from_server_pass(program, config):
     fetch_barrier_out = program.global_block().create_var(
         name=framework.generate_control_dev_var_name())
 
-    recv_ctx = config.get_communicator_recv_context(recv_type=1)
-    recv_varnames = []
-
-    for name, ctxs in recv_ctx.items():
-        recv_varnames.extend(ctxs.origin_varnames())
-
-    program.global_block().append_op(
-        type="recv",
-        inputs={"X": []},
-        outputs={"Out": []},
-        attrs={
-            "recv_varnames": recv_varnames,
-            "trainer_id": config.get_role_id(),
-            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-        })
-
     program.global_block().append_op(
         type="fetch_barrier",
         inputs={},
diff --git a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
index 60378aa98272dae32a97b33e84fc61e71193658c..06a90b78fd2e53d065f1abbaf9e95df848f9cc52 100644
--- a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
+++ b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
@@ -164,8 +164,8 @@ def train(args):
     elif fleet.is_worker():
         logger.info("run trainer")
 
-        fleet.init_worker()
         exe.run(fleet.startup_program)
+        fleet.init_worker()
 
         thread_num = 2
         filelist = []
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index 4595f0cf93916d71a3d0ec582af1917500d68f12..92b58a7e2ee4c76af7047a14f67e40d76be76dc0 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -54,29 +54,6 @@ EXPRESSION_MAP = {
     "__ge__": "A >= B"
 }
 
-# method for Tensor from paddle.tensor
-# edit it when paddle.tensor has new method about Tensor operation
-common_methods = [
-    'exp', 'tanh', 'atan', 'sqrt', 'rsqrt', 'abs', 'ceil', 'floor', 'cos',
-    'acos', 'asin', 'sin', 'sinh', 'cosh', 'round', 'reciprocal', 'square',
-    'rank', 'matmul', 'dot', 'norm', 'transpose', 'dist', 't', 'cross',
-    'cholesky', 'bmm', 'histogram', 'equal', 'greater_equal', 'greater_than',
-    'is_empty', 'isfinite', 'less_equal', 'less_than', 'logical_and',
-    'logical_not', 'logical_or', 'logical_xor', 'not_equal', 'reduce_all',
-    'reduce_any', 'allclose', 'equal_all', 'cast', 'expand', 'expand_as',
-    'tile', 'flatten', 'gather', 'gather_nd', 'reshape', 'reverse', 'scatter',
-    'scatter_nd_add', 'scatter_nd', 'shard_index', 'slice', 'split', 'squeeze',
-    'strided_slice', 'unique', 'unique_with_counts', 'unsqueeze', 'flip',
-    'unbind', 'roll', 'cumsum', 'increment', 'log', 'pow', 'reciprocal',
-    'round', 'rsqrt', 'scale', 'sign', 'stanh', 'sum', 'reduce_prod', 'max',
-    'min', 'mm', 'div', 'multiply', 'add', 'logsumexp', 'log1p', 'erf',
-    'addcmul', 'addmm', 'clamp', 'trace', 'kron', 'argmax', 'argmin', 'argsort',
-    'has_inf', 'has_nan', 'topk', 'index_select', 'nonzero', 'sort',
-    'index_sample', 'mean', 'std', 'var', 'elementwise_add', 'elementwise_div',
-    'elementwise_floordiv', 'elementwise_mod', 'elementwise_pow',
-    'elementwise_sub'
-]
-
 _already_patch_variable = False
 
 
@@ -372,7 +349,14 @@ def monkey_patch_variable():
             setattr(Variable, method_name, method_impl)
     else:
         import paddle.tensor
-        for method_name in common_methods:
+        variabel_methods = paddle.tensor.linalg.__all__ + \
+                           paddle.tensor.math.__all__ + \
+                           paddle.tensor.logic.__all__ + \
+                           paddle.tensor.manipulation.__all__ + \
+                           paddle.tensor.search.__all__ + \
+                           paddle.tensor.stat.__all__ + \
+                           paddle.tensor.attribute.__all__
+        for method_name in variabel_methods:
             if hasattr(Variable, method_name): continue
             method_impl = getattr(paddle.tensor, method_name, None)
             if method_impl: setattr(Variable, method_name, method_impl)
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index cf52f3b00fb2739d186021dc51d6aa0f506be706..2fba578ec077f2a74388d433bf3ab5b3098e81ad 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -1453,11 +1453,14 @@ def linspace(start, stop, num, dtype=None, name=None):
     if not isinstance(dtype, core.VarDesc.VarType):
         dtype = convert_np_dtype_to_dtype_(dtype)
     if not isinstance(start, Variable):
-        tensor_start = fill_constant([1], dtype, start)
+        with device_guard("cpu"):
+            tensor_start = fill_constant([1], dtype, start)
     if not isinstance(stop, Variable):
-        tensor_stop = fill_constant([1], dtype, stop)
+        with device_guard("cpu"):
+            tensor_stop = fill_constant([1], dtype, stop)
     if not isinstance(num, Variable):
-        tensor_num = fill_constant([1], 'int32', num)
+        with device_guard("cpu"):
+            tensor_num = fill_constant([1], 'int32', num)
     if in_dygraph_mode():
         return core.ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype',
                                  dtype)
diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt
index 673c965b662a022739f8d489c331f4de9455a926..96321aae566d1f910042f4e348d0be8b3e88c341 100644
--- a/python/paddle/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/CMakeLists.txt
@@ -4,4 +4,5 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 # default test
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
+    set_tests_properties(${src} PROPERTIES FIXTURES_SETUP ${src}_infer_model)
 endforeach()
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
index 8277499fcce341207fa75a74dfda0a2ccc2e3b63..5721445c414cf94379f44cab6bd01cca511938bf 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
@@ -163,8 +163,10 @@ class TestDistCTR2x2(FleetDistRunnerBase):
         """
 
         exe = fluid.Executor(fluid.CPUPlace())
-        fleet.init_worker()
+
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
+
         batch_size = 4
         train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
         self.reader.decorate_sample_list_generator(train_reader)
@@ -202,8 +204,8 @@ class TestDistCTR2x2(FleetDistRunnerBase):
 
         exe = fluid.Executor(fluid.CPUPlace())
 
-        fleet.init_worker()
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
 
         thread_num = 2
         batch_size = 128
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
index 0e3c80992771424e4216a79b991de1c62884c757..3852b225234ffacc2be749245fb1341331868272 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
@@ -60,8 +60,9 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
         device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
         place = fluid.CUDAPlace(device_id)
         exe = fluid.Executor(place)
-        fleet.init_worker()
+
         exe.run(fleet.startup_program)
+        fleet.init_worker()
 
         batch_size = 4
         train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
@@ -104,8 +105,8 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
         place = fluid.CUDAPlace(device_id)
         exe = fluid.Executor(place)
 
-        fleet.init_worker()
         exe.run(fleet.startup_program)
+        fleet.init_worker()
 
         thread_num = 2
         batch_size = 128
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
index 2f938a813d8a7598e49023066759a490eab53263..470fb98d7991cf0cbffa47f6d5129b045f59ae97 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
@@ -152,8 +152,9 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
         """
 
         exe = fluid.Executor(fluid.CPUPlace())
-        fleet.init_worker()
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
+
         batch_size = 4
         train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
         self.reader.decorate_sample_list_generator(train_reader)
@@ -176,8 +177,8 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
 
         exe = fluid.Executor(fluid.CPUPlace())
 
-        fleet.init_worker()
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
 
         thread_num = int(os.getenv("CPU_NUM", 2))
         batch_size = 128
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
index 2ea69e1b6763087bb2b278b59a8a59b4331847da..ff84848873924c52b0f7e8f5bc71ec2a266b73f1 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
@@ -222,8 +222,8 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase):
         """
 
         exe = fluid.Executor(fluid.CPUPlace())
-        fleet.init_worker()
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
         batch_size = 4
         # reader
         train_reader = paddle.batch(fake_simnet_reader(), batch_size=batch_size)
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
index 77697896b4d556da8a98c17e281b3d7a6999fd64..81530573a604205f0202d088853038bbc71b92e6 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
@@ -151,8 +151,9 @@ class TestDistCTR2x2(FleetDistRunnerBase):
         """
 
         exe = fluid.Executor(fluid.CPUPlace())
-        fleet.init_worker()
+
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
 
         batch_size = 4
 
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
index 8d2677229a03f7bdac14a93e176747ba0a5f1d6b..ab1127afa58dd93aa92688eebdf82292990f59b1 100644
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
@@ -47,7 +47,7 @@ class TestSimpleRNNCell(unittest.TestCase):
         prev_h = np.random.randn(4, 32)
 
         y1, h1 = rnn1(x, prev_h)
-        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
     def test_with_zero_state(self):
@@ -57,7 +57,7 @@ class TestSimpleRNNCell(unittest.TestCase):
         x = np.random.randn(4, 16)
 
         y1, h1 = rnn1(x)
-        y2, h2 = rnn2(paddle.to_variable(x))
+        y2, h2 = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
     def runTest(self):
@@ -90,7 +90,7 @@ class TestGRUCell(unittest.TestCase):
         prev_h = np.random.randn(4, 32)
 
         y1, h1 = rnn1(x, prev_h)
-        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
     def test_with_zero_state(self):
@@ -100,7 +100,7 @@ class TestGRUCell(unittest.TestCase):
         x = np.random.randn(4, 16)
 
         y1, h1 = rnn1(x)
-        y2, h2 = rnn2(paddle.to_variable(x))
+        y2, h2 = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
     def runTest(self):
@@ -134,8 +134,8 @@ class TestLSTMCell(unittest.TestCase):
 
         y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
         y2, (h2, c2) = rnn2(
-            paddle.to_variable(x),
-            (paddle.to_variable(prev_h), paddle.to_variable(prev_c)))
+            paddle.to_tensor(x),
+            (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c)))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
 
@@ -146,7 +146,7 @@ class TestLSTMCell(unittest.TestCase):
         x = np.random.randn(4, 16)
 
         y1, (h1, c1) = rnn1(x)
-        y2, (h2, c2) = rnn2(paddle.to_variable(x))
+        y2, (h2, c2) = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
 
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
index ef297b3bb62497073fd667238cae8a83daaa4967..7c03b51837ef6f7be8021dca55daf3b43f7d3053 100644
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
@@ -53,7 +53,7 @@ class TestSimpleRNN(unittest.TestCase):
         prev_h = np.random.randn(2 * self.num_directions, 4, 32)
 
         y1, h1 = rnn1(x, prev_h)
-        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
@@ -66,7 +66,7 @@ class TestSimpleRNN(unittest.TestCase):
             x = np.transpose(x, [1, 0, 2])
 
         y1, h1 = rnn1(x)
-        y2, h2 = rnn2(paddle.to_variable(x))
+        y2, h2 = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
@@ -81,11 +81,11 @@ class TestSimpleRNN(unittest.TestCase):
 
         y1, h1 = rnn1(x, sequence_length=sequence_length)
 
-        seq_len = paddle.to_variable(sequence_length)
+        seq_len = paddle.to_tensor(sequence_length)
         mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
         if self.time_major:
             mask = paddle.transpose(mask, [1, 0])
-        y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len)
+        y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
         y2 = paddle.multiply(y2, mask, axis=0)
 
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
@@ -133,7 +133,7 @@ class TestGRU(unittest.TestCase):
         prev_h = np.random.randn(2 * self.num_directions, 4, 32)
 
         y1, h1 = rnn1(x, prev_h)
-        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
@@ -146,7 +146,7 @@ class TestGRU(unittest.TestCase):
             x = np.transpose(x, [1, 0, 2])
 
         y1, h1 = rnn1(x)
-        y2, h2 = rnn2(paddle.to_variable(x))
+        y2, h2 = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
@@ -161,11 +161,11 @@ class TestGRU(unittest.TestCase):
 
         y1, h1 = rnn1(x, sequence_length=sequence_length)
 
-        seq_len = paddle.to_variable(sequence_length)
+        seq_len = paddle.to_tensor(sequence_length)
         mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
         if self.time_major:
             mask = paddle.transpose(mask, [1, 0])
-        y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len)
+        y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
         y2 = paddle.multiply(y2, mask, axis=0)
 
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
@@ -209,8 +209,8 @@ class TestLSTM(unittest.TestCase):
 
         y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
         y2, (h2, c2) = rnn2(
-            paddle.to_variable(x),
-            (paddle.to_variable(prev_h), paddle.to_variable(prev_c)))
+            paddle.to_tensor(x),
+            (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c)))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
@@ -224,7 +224,7 @@ class TestLSTM(unittest.TestCase):
             x = np.transpose(x, [1, 0, 2])
 
         y1, (h1, c1) = rnn1(x)
-        y2, (h2, c2) = rnn2(paddle.to_variable(x))
+        y2, (h2, c2) = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
@@ -240,11 +240,11 @@ class TestLSTM(unittest.TestCase):
 
         y1, (h1, c1) = rnn1(x, sequence_length=sequence_length)
 
-        seq_len = paddle.to_variable(sequence_length)
+        seq_len = paddle.to_tensor(sequence_length)
         mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
         if self.time_major:
             mask = paddle.transpose(mask, [1, 0])
-        y2, (h2, c2) = rnn2(paddle.to_variable(x), sequence_length=seq_len)
+        y2, (h2, c2) = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
         y2 = paddle.multiply(y2, mask, axis=0)
 
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_async.py b/python/paddle/fluid/tests/unittests/test_communicator_async.py
index d032d6d75b5b3a48ea1e752190952f4c52e23b07..a86b80b2cf98829a683045ae302f72a694809138 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_async.py
@@ -30,11 +30,10 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distribu
 
 class TestCommunicator(unittest.TestCase):
     def net(self):
-        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-        y_predict = fluid.layers.fc(input=x, size=1, act=None)
+        x = fluid.layers.data(name='x', shape=[1], dtype='float32')
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
 
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        cost = fluid.layers.square_error_cost(input=x, label=y)
         avg_cost = fluid.layers.mean(cost)
         return avg_cost
 
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
index d9fc9262b311f949a1a89cd079517c5c93d0d28d..5916000fba79fc0da2ef545beac634a3edfe01df 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
@@ -83,8 +83,8 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)
 
-        fleet.init_worker()
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
 
         train_reader = paddle.batch(self.fake_reader(), batch_size=24)
         feeder = fluid.DataFeeder(place=place, feed_list=[x, z, y])
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
index 391588780f342dc17ea821334e28f941f9ce359a..b0f55f2939dc94af603f4cc5851dbb5e6317774f 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
@@ -71,8 +71,8 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)
 
-        fleet.init_worker()
         exe.run(fleet.startup_program)
+        fleet.init_worker()
 
         train_reader = paddle.batch(self.fake_reader(), batch_size=24)
         feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
index c0044d9d620796057cce0e3a51b2dec2878a0e17..95b209b14602676a089a667b0a720056bbe1562b 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
@@ -27,11 +27,9 @@ import paddle.distributed.fleet as fleet
 
 class TestCommunicator(unittest.TestCase):
     def net(self):
-        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-        y_predict = fluid.layers.fc(input=x, size=1, act=None)
+        x = fluid.layers.data(name='x', shape=[1], dtype='float32')
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        cost = fluid.layers.square_error_cost(input=x, label=y)
         avg_cost = fluid.layers.mean(cost)
         return avg_cost
 
diff --git a/python/paddle/fluid/tests/unittests/test_diag.py b/python/paddle/fluid/tests/unittests/test_diag.py
index 780d57b53310bb5f385a131d4ad52dd6f5e695f0..ddf1240e4ef27775a24cee540c5f193399112270 100644
--- a/python/paddle/fluid/tests/unittests/test_diag.py
+++ b/python/paddle/fluid/tests/unittests/test_diag.py
@@ -119,6 +119,16 @@ class TestDiagV2API(unittest.TestCase):
             (n, n)) + np.diag(self.input_np3, self.offset) - np.diag(
                 self.padding_value * np.ones(n))
 
+        self.input_np4 = np.random.random(size=(2000, 2000)).astype(np.float32)
+        self.expected6 = np.diag(self.input_np4)
+        self.expected7 = np.diag(self.input_np4, k=1)
+        self.expected8 = np.diag(self.input_np4, k=-1)
+
+        self.input_np5 = np.random.random(size=(2000)).astype(np.float32)
+        self.expected9 = np.diag(self.input_np5)
+        self.expected10 = np.diag(self.input_np5, k=1)
+        self.expected11 = np.diag(self.input_np5, k=-1)
+
     def run_imperative(self):
         x = paddle.to_tensor(self.input_np)
         y = paddle.diag(x)
@@ -141,10 +151,32 @@ class TestDiagV2API(unittest.TestCase):
         y = paddle.diag(x, padding_value=-8)
         self.assertTrue(np.allclose(y.numpy(), self.expected5))
 
+        x = paddle.to_tensor(self.input_np4)
+        y = paddle.diag(x)
+        self.assertTrue(np.allclose(y.numpy(), self.expected6))
+
+        y = paddle.diag(x, offset=1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected7))
+
+        y = paddle.diag(x, offset=-1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected8))
+
+        x = paddle.to_tensor(self.input_np5)
+        y = paddle.diag(x)
+        self.assertTrue(np.allclose(y.numpy(), self.expected9))
+
+        y = paddle.diag(x, offset=1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected10))
+
+        y = paddle.diag(x, offset=-1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected11))
+
     def run_static(self, use_gpu=False):
         x = paddle.data(name='input', shape=[10, 10], dtype='float32')
         x2 = paddle.data(name='input2', shape=[100], dtype='float64')
         x3 = paddle.data(name='input3', shape=[100], dtype='int64')
+        x4 = paddle.data(name='input4', shape=[2000, 2000], dtype='float32')
+        x5 = paddle.data(name='input5', shape=[2000], dtype='float32')
         result0 = paddle.diag(x)
         result1 = paddle.diag(x, offset=1)
         result2 = paddle.diag(x, offset=-1)
@@ -152,17 +184,28 @@ class TestDiagV2API(unittest.TestCase):
         result4 = paddle.diag(x2, padding_value=8)
         result5 = paddle.diag(x3, padding_value=8.0)
         result6 = paddle.diag(x3, padding_value=-8)
+        result7 = paddle.diag(x4)
+        result8 = paddle.diag(x4, offset=1)
+        result9 = paddle.diag(x4, offset=-1)
+        result10 = paddle.diag(x5)
+        result11 = paddle.diag(x5, offset=1)
+        result12 = paddle.diag(x5, offset=-1)
 
         place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        res0, res1, res2, res4, res5, res6 = exe.run(
+        res0, res1, res2, res4, res5, res6, res7, res8, res9, res10, res11, res12 = exe.run(
             feed={
                 "input": self.input_np,
                 "input2": self.input_np2,
-                'input3': self.input_np3
+                'input3': self.input_np3,
+                'input4': self.input_np4,
+                'input5': self.input_np5
             },
-            fetch_list=[result0, result1, result2, result4, result5, result6])
+            fetch_list=[
+                result0, result1, result2, result4, result5, result6, result7,
+                result8, result9, result10, result11, result12
+            ])
 
         self.assertTrue(np.allclose(res0, self.expected0))
         self.assertTrue(np.allclose(res1, self.expected1))
@@ -171,6 +214,12 @@ class TestDiagV2API(unittest.TestCase):
         self.assertTrue(np.allclose(res4, self.expected3))
         self.assertTrue(np.allclose(res5, self.expected4))
         self.assertTrue(np.allclose(res6, self.expected5))
+        self.assertTrue(np.allclose(res7, self.expected6))
+        self.assertTrue(np.allclose(res8, self.expected7))
+        self.assertTrue(np.allclose(res9, self.expected8))
+        self.assertTrue(np.allclose(res10, self.expected9))
+        self.assertTrue(np.allclose(res11, self.expected10))
+        self.assertTrue(np.allclose(res12, self.expected11))
 
     def test_cpu(self):
         paddle.disable_static(place=paddle.fluid.CPUPlace())
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
index a82612b0ed2a6700dd157ddd6263cae2a879c274..7f55e956a94aee79dda07762e953e71807899bff 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
@@ -44,16 +44,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
         paddle.fluid.framework.switch_startup_program(startup_program)
 
         fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+        x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
+        y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
+        avg_cost = paddle.fluid.layers.mean(cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
@@ -71,7 +66,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
                 sends += 1
             if op.type == "sgd":
                 sgds += 1
-        self.assertEqual(sends, 7)
+        self.assertEqual(sends, 1)
         self.assertEqual(sgds, 0)
 
         fleet.init_worker()
@@ -89,16 +84,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
         paddle.fluid.framework.switch_startup_program(startup_program)
 
         fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
+        y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
+        avg_cost = paddle.fluid.layers.mean(cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
index b05a53c88bb9154b69640df6c39305a00e3c447b..db3f2afb3668bc1831286f8d13b274895e7632fd 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
@@ -36,16 +36,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
 
     def test_gradient_merge_optimizer(self):
         fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+        x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
+        y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
+        avg_cost = paddle.fluid.layers.mean(cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = False
@@ -63,7 +58,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
                 sends += 1
             if op.type == "sgd":
                 sgds += 1
-        self.assertEqual(sends, 6)
+        self.assertEqual(sends, 0)
         self.assertEqual(sgds, 0)
 
         fleet.init_worker()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
index 379bcaf684d53c2c72f6369e72418cdaaaf3ac84..6fe52ba9fe61ad83341ece5c29fcafa89095de82 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
@@ -70,15 +70,13 @@ class TestPSPassWithBow(unittest.TestCase):
         q = fluid.layers.data(
             name="query_ids", shape=[1], dtype="int64", lod_level=1)
         # embedding
-        q_emb = fluid.layers.embedding(
+        q_emb = fluid.contrib.layers.sparse_embedding(
             input=q,
-            is_distributed=is_distributed,
             size=[dict_dim, emb_dim],
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.01),
                 name="__emb__",
-                learning_rate=emb_lr),
-            is_sparse=is_sparse)
+                learning_rate=emb_lr))
         q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
         # vsum
         q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
@@ -97,15 +95,13 @@ class TestPSPassWithBow(unittest.TestCase):
         pt = fluid.layers.data(
             name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
         # embedding
-        pt_emb = fluid.layers.embedding(
+        pt_emb = fluid.contrib.layers.sparse_embedding(
             input=pt,
-            is_distributed=is_distributed,
             size=[dict_dim, emb_dim],
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.01),
                 name="__emb__",
-                learning_rate=emb_lr),
-            is_sparse=is_sparse)
+                learning_rate=emb_lr))
         pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
         # vsum
         pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
@@ -123,15 +119,13 @@ class TestPSPassWithBow(unittest.TestCase):
         nt = fluid.layers.data(
             name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
         # embedding
-        nt_emb = fluid.layers.embedding(
+        nt_emb = fluid.contrib.layers.sparse_embedding(
             input=nt,
-            is_distributed=is_distributed,
             size=[dict_dim, emb_dim],
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.01),
                 name="__emb__",
-                learning_rate=emb_lr),
-            is_sparse=is_sparse)
+                learning_rate=emb_lr))
         nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
         # vsum
         nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
@@ -167,7 +161,7 @@ class TestPSPassWithBow(unittest.TestCase):
 
         fleet.init(role)
         loss, acc, _ = self.net()
-        optimizer = fluid.optimizer.SGD(base_lr)
+        optimizer = fluid.optimizer.Adam(base_lr)
         strategy = StrategyFactory.create_async_strategy()
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(loss)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
index fd069793473648a0dff731d66c85bd3fe61997c7..c570c4d8cd01dd7e7b113b1f5f35c9887f4a4376 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
@@ -168,12 +168,13 @@ class TestPSPassWithBow(unittest.TestCase):
         fleet.init(role)
         loss, acc, _ = self.net()
 
-        optimizer = fluid.optimizer.SGD(
+        optimizer = fluid.optimizer.Adagrad(
             learning_rate=fluid.layers.exponential_decay(
                 learning_rate=base_lr,
                 decay_steps=500,
                 decay_rate=0.969,
                 staircase=True))
+
         strategy = StrategyFactory.create_async_strategy()
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(loss)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5b1284e3ce316114122d5bbeb6d88cbabc3f160
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
@@ -0,0 +1,168 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+
+# For Net
+base_lr = 0.2
+emb_lr = base_lr * 3
+dict_dim = 1500
+emb_dim = 128
+hid_dim = 128
+margin = 0.1
+sample_rate = 1
+batch_size = 4
+
+
+class TestPSPassWithBow(unittest.TestCase):
+    def net(self):
+        def get_acc(cos_q_nt, cos_q_pt, batch_size):
+            cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
+            cond = fluid.layers.cast(cond, dtype='float64')
+            cond_3 = fluid.layers.reduce_sum(cond)
+            acc = fluid.layers.elementwise_div(
+                cond_3,
+                fluid.layers.fill_constant(
+                    shape=[1], value=batch_size * 1.0, dtype='float64'),
+                name="simnet_acc")
+            return acc
+
+        def get_loss(cos_q_pt, cos_q_nt):
+            loss_op1 = fluid.layers.elementwise_sub(
+                fluid.layers.fill_constant_batch_size_like(
+                    input=cos_q_pt,
+                    shape=[-1, 1],
+                    value=margin,
+                    dtype='float32'),
+                cos_q_pt)
+            loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
+            loss_op3 = fluid.layers.elementwise_max(
+                fluid.layers.fill_constant_batch_size_like(
+                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+                loss_op2)
+            avg_cost = fluid.layers.mean(loss_op3)
+            return avg_cost
+
+        is_distributed = False
+        is_sparse = True
+
+        # query
+        q = fluid.layers.data(
+            name="query_ids", shape=[1], dtype="int64", lod_level=1)
+        # embedding
+        q_emb = fluid.contrib.layers.sparse_embedding(
+            input=q,
+            size=[dict_dim, emb_dim],
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__",
+                learning_rate=emb_lr))
+        q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
+        # vsum
+        q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
+        q_ss = fluid.layers.softsign(q_sum)
+        # fc layer after conv
+        q_fc = fluid.layers.fc(
+            input=q_ss,
+            size=hid_dim,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__q_fc__",
+                learning_rate=base_lr))
+        # label data
+        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+        # pt
+        pt = fluid.layers.data(
+            name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+        # embedding
+        pt_emb = fluid.contrib.layers.sparse_embedding(
+            input=pt,
+            size=[dict_dim, emb_dim],
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__",
+                learning_rate=emb_lr))
+        pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
+        # vsum
+        pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
+        pt_ss = fluid.layers.softsign(pt_sum)
+        # fc layer
+        pt_fc = fluid.layers.fc(
+            input=pt_ss,
+            size=hid_dim,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__fc__",
+                learning_rate=base_lr),
+            bias_attr=fluid.ParamAttr(name="__fc_b__"))
+        # nt
+        nt = fluid.layers.data(
+            name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+        # embedding
+        nt_emb = fluid.contrib.layers.sparse_embedding(
+            input=nt,
+            size=[dict_dim, emb_dim],
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__",
+                learning_rate=emb_lr))
+        nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
+        # vsum
+        nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
+        nt_ss = fluid.layers.softsign(nt_sum)
+        # fc layer
+        nt_fc = fluid.layers.fc(
+            input=nt_ss,
+            size=hid_dim,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__fc__",
+                learning_rate=base_lr),
+            bias_attr=fluid.ParamAttr(name="__fc_b__"))
+        cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
+        cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
+        # loss
+        avg_cost = get_loss(cos_q_pt, cos_q_nt)
+        # acc
+        acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
+        return [avg_cost, acc, cos_q_pt]
+
+    def test(self):
+        endpoints = [
+            "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
+            "127.0.0.1:36007"
+        ]
+
+        role = role_maker.UserDefinedRoleMaker(
+            current_id=0,
+            role=role_maker.Role.SERVER,
+            worker_num=2,
+            server_endpoints=endpoints)
+
+        fleet.init(role)
+        loss, acc, _ = self.net()
+        optimizer = fluid.optimizer.Adagrad(base_lr)
+        strategy = StrategyFactory.create_async_strategy()
+        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(loss)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..bca91c536ba32b05138f2860c13fdd1899a2e011
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
@@ -0,0 +1,171 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+
+class TestLookupTableFuseOp(unittest.TestCase):
+    def test_fuse(self):
+        places = [core.CPUPlace()]
+        # currently only support CPU
+        for place in places:
+            self.check_with_place(place)
+
+    def check_with_place(self, place):
+        scope = fluid.global_scope()
+        scope.var("LearningRate").get_tensor().set([0.01], place)
+        scope.var("Ids").get_tensor().set([i for i in range(100)], place)
+
+        init_program = fluid.Program()
+
+        lr = init_program.global_block().create_var(
+            name="LearningRate",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[1],
+            dtype="float32")
+
+        ids = init_program.global_block().create_var(
+            name="Ids",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[100],
+            dtype="int64")
+
+        output = init_program.global_block().create_var(
+            name="output",
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[100, 8],
+            dtype="float32")
+
+        metas = []
+        metas.append(
+            "embedding_1.block0:Param,Moment1,Moment2:8,8,8:0:embedding_1@GRAD.block0:embedding_1.block0,embedding_1_moment1_0,embedding_1_moment2_0,kSparseIDs@embedding_1.block0:uniform_random&0&-0.5&0.5,fill_constant&0.0,fill_constant&0.0:none"
+        )
+        metas.append(
+            "embedding_2.block0:Param:8:0:embedding_2@GRAD.block0:embedding_2.block0,kSparseIDs@embedding_2.block0:uniform_random&0&-0.5&0.5:none"
+        )
+
+        init_program.global_block().append_op(
+            type="lookup_sparse_table_init",
+            inputs=None,
+            outputs=None,
+            attrs={"large_scale_metas": metas})
+
+        init_program.global_block().append_op(
+            type="lookup_sparse_table_read",
+            inputs={"Ids": ids},
+            outputs={"Out": output},
+            attrs={
+                "tablename": "embedding_1.block0",
+                "init": True,
+                "value_names": ["Param"],
+            })
+
+        init_program.global_block().append_op(
+            type="lookup_sparse_table_read",
+            inputs={"Ids": ids},
+            outputs={"Out": output},
+            attrs={
+                "tablename": "embedding_2.block0",
+                "init": True,
+                "value_names": ["Param"],
+            })
+
+        executor = fluid.Executor(place)
+        executor.run(init_program)
+
+        training_program = fluid.Program()
+
+        scope.var('Beta1Pow').get_tensor().set(
+            np.array([0]).astype("float32"), place)
+        scope.var('Beta2Pow').get_tensor().set(
+            np.array([0]).astype("float32"), place)
+
+        rows = [0, 1, 2, 3, 4, 5, 6]
+        row_numel = 8
+        w_selected_rows = scope.var('Grad').get_selected_rows()
+        w_selected_rows.set_height(len(rows))
+        w_selected_rows.set_rows(rows)
+        w_array = np.ones((len(rows), row_numel)).astype("float32")
+        for i in range(len(rows)):
+            w_array[i] *= i
+        w_tensor = w_selected_rows.get_tensor()
+        w_tensor.set(w_array, place)
+
+        lr = training_program.global_block().create_var(
+            name="LearningRate",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[1],
+            dtype="float32")
+
+        grads = training_program.global_block().create_var(
+            name="Grad",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.SELECTED_ROWS,
+            shape=[100, 8],
+            dtype="float32")
+
+        beta1 = training_program.global_block().create_var(
+            name="Beta1Pow",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[1],
+            dtype="float32")
+
+        beta2 = training_program.global_block().create_var(
+            name="Beta2Pow",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[1],
+            dtype="float32")
+
+        training_program.global_block().append_op(
+            type="lookup_sparse_table_fuse_adam",
+            inputs={
+                "Grad": grads,
+                "LearningRate": lr,
+                "Beta1Pow": beta1,
+                "Beta2Pow": beta2,
+            },
+            outputs={"Beta1PowOut": beta1,
+                     "Beta2PowOut": beta2},
+            attrs={
+                "is_entry": False,
+                "tablename": "embedding_1.block0",
+                "value_names": ["Param", "Moment1", "Moment2"],
+            })
+
+        training_program.global_block().append_op(
+            type="lookup_sparse_table_fuse_sgd",
+            inputs={"Grad": grads,
+                    "LearningRate": lr},
+            attrs={
+                "is_entry": False,
+                "tablename": "embedding_2.block0",
+                "value_names": ["Param"],
+            })
+
+        executor.run(training_program)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bc305cd1f4dcd3faaaf8ccbe813bdf08e966d6e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py
@@ -0,0 +1,215 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "Paddle core is not compiled with CUDA")
+class TestFusedBnAddActAPI(unittest.TestCase):
+    def setUp(self):
+        self.conv_param_attr1 = fluid.ParamAttr(
+            name='conv2d_1.weight',
+            initializer=fluid.initializer.Xavier(uniform=False),
+            learning_rate=0.001)
+        self.conv_param_attr2 = fluid.ParamAttr(
+            name='conv2d_2.weight',
+            initializer=fluid.initializer.Xavier(uniform=False),
+            learning_rate=0.001)
+        self.bn_param_attr1 = fluid.ParamAttr(
+            name='batch_norm_w_1',
+            initializer=fluid.initializer.Constant(value=1.0))
+        self.bn_bias_attr1 = fluid.ParamAttr(
+            name='batch_norm_b_1',
+            initializer=fluid.initializer.Constant(value=0.0))
+        self.bn_param_attr2 = fluid.ParamAttr(
+            name='batch_norm_w_2',
+            initializer=fluid.initializer.Constant(value=1.0))
+        self.bn_bias_attr2 = fluid.ParamAttr(
+            name='batch_norm_b_2',
+            initializer=fluid.initializer.Constant(value=0.0))
+        self.fc_param_attr = fluid.ParamAttr(
+            name='fc.weight',
+            initializer=fluid.initializer.Xavier(uniform=False))
+
+    def build_fused_program(self,
+                            main_program,
+                            startup_program,
+                            use_cuda,
+                            seed=1):
+        with fluid.program_guard(main_program, startup_program):
+            x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
+            y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+            conv1_1 = fluid.layers.conv2d(
+                input=x,
+                filter_size=3,
+                num_filters=32,
+                stride=1,
+                padding=1,
+                act=None,
+                param_attr=self.conv_param_attr1,
+                bias_attr=False,
+                data_format='NHWC')
+            conv1_2 = fluid.layers.conv2d(
+                input=x,
+                filter_size=3,
+                num_filters=32,
+                stride=1,
+                padding=1,
+                act=None,
+                param_attr=self.conv_param_attr2,
+                bias_attr=False,
+                data_format='NHWC')
+            bn = fluid.layers.batch_norm(
+                input=conv1_1,
+                param_attr=self.bn_param_attr1,
+                bias_attr=self.bn_bias_attr1,
+                act=None,
+                data_layout='NHWC')
+            fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(
+                conv1_2,
+                bn,
+                param_attr=self.bn_param_attr2,
+                bias_attr=self.bn_bias_attr2)
+            prediction = fluid.layers.fc(input=fused_bn_add_act,
+                                         size=10,
+                                         act='softmax',
+                                         param_attr=self.fc_param_attr)
+            loss = fluid.layers.cross_entropy(input=prediction, label=y)
+            loss = fluid.layers.mean(loss)
+            sgd = fluid.optimizer.SGD(learning_rate=0.001)
+            sgd = fluid.contrib.mixed_precision.decorate(
+                sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
+            sgd.minimize(loss)
+
+        return x, y, loss
+
+    def build_origin_program(self,
+                             main_program,
+                             startup_program,
+                             use_cuda,
+                             seed=1):
+        with fluid.program_guard(main_program, startup_program):
+            x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
+            y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+            conv1_1 = fluid.layers.conv2d(
+                input=x,
+                filter_size=3,
+                num_filters=32,
+                stride=1,
+                padding=1,
+                act=None,
+                param_attr=self.conv_param_attr1,
+                bias_attr=False,
+                data_format='NHWC')
+            conv1_2 = fluid.layers.conv2d(
+                input=x,
+                filter_size=3,
+                num_filters=32,
+                stride=1,
+                padding=1,
+                act=None,
+                param_attr=self.conv_param_attr2,
+                bias_attr=False,
+                data_format='NHWC')
+            bn1 = fluid.layers.batch_norm(
+                input=conv1_1,
+                param_attr=self.bn_param_attr1,
+                bias_attr=self.bn_bias_attr1,
+                act=None,
+                data_layout='NHWC')
+            bn2 = fluid.layers.batch_norm(
+                input=conv1_2,
+                param_attr=self.bn_param_attr2,
+                bias_attr=self.bn_bias_attr2,
+                act=None,
+                data_layout='NHWC')
+            out = bn1 + bn2
+            out = fluid.layers.relu(out)
+            prediction = fluid.layers.fc(input=out,
+                                         size=10,
+                                         act='softmax',
+                                         param_attr=self.fc_param_attr)
+            loss = fluid.layers.cross_entropy(input=prediction, label=y)
+            loss = fluid.layers.mean(loss)
+            sgd = fluid.optimizer.SGD(learning_rate=0.001)
+            sgd = fluid.contrib.mixed_precision.decorate(
+                sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
+            sgd.minimize(loss)
+
+        return x, y, loss
+
+    def check(self, place, use_cuda):
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
+        iters = 5
+        batch_size = 16
+
+        # build_fused_program
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        x, y, loss = self.build_fused_program(main_program, startup_program,
+                                              use_cuda)
+        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=batch_size)
+        exe = fluid.Executor(place)
+        loss_vals_fused = []
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup_program)
+            for _ in range(iters):
+                data = next(train_reader())
+                loss_v = exe.run(main_program,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+                loss_vals_fused.append(loss_v[0][0])
+
+        # build_origin_program
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        x, y, loss = self.build_origin_program(main_program, startup_program,
+                                               use_cuda)
+        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=batch_size)
+        loss_vals = []
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup_program)
+            for _ in range(iters):
+                data = next(train_reader())
+                loss_v = exe.run(main_program,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+                loss_vals.append(loss_v[0][0])
+
+        # check loss
+        for i in range(iters):
+            self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5)
+
+    def test_fuse_bn_add_act(self):
+        place = fluid.CUDAPlace(0)
+        self.check(place, use_cuda=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
index a70862f40197c513a0cd04753553264708ee2a1c..5df04ddfc3d26492323153b8b26658db4325b7ec 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
@@ -19,6 +19,7 @@ import paddle
 import paddle.fluid as fluid
 import numpy as np
 import six
+import inspect
 
 
 class TestMathOpPatchesVarBase(unittest.TestCase):
@@ -302,21 +303,13 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
         self.assertEqual(x.dim(), 2)
         self.assertEqual(x.ndimension(), 2)
         self.assertEqual(x.ndim, 2)
-        self.assertEqual(x.size(), [2, 3])
-        self.assertTrue(
-            np.array_equal(x.sigmoid().numpy(), fluid.layers.sigmoid(x).numpy(
-            )))
-        self.assertTrue(
-            np.array_equal(x.log_sigmoid().numpy(),
-                           fluid.layers.logsigmoid(x).numpy()))
+        self.assertEqual(x.size, 6)
+        self.assertEqual(x.numel(), 6)
         self.assertTrue(np.array_equal(x.exp().numpy(), paddle.exp(x).numpy()))
         self.assertTrue(
             np.array_equal(x.tanh().numpy(), paddle.tanh(x).numpy()))
         self.assertTrue(
             np.array_equal(x.atan().numpy(), paddle.atan(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.tanh_shrink().numpy(),
-                           fluid.layers.tanh_shrink(x).numpy()))
         self.assertTrue(np.array_equal(x.abs().numpy(), paddle.abs(x).numpy()))
         m = x.abs()
         self.assertTrue(
@@ -344,12 +337,6 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
             )))
         self.assertTrue(
             np.array_equal(x.square().numpy(), paddle.square(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.softplus().numpy(),
-                           fluid.layers.softplus(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.softsign().numpy(),
-                           fluid.layers.softsign(x).numpy()))
         self.assertTrue(
             np.array_equal(x.rank().numpy(), paddle.rank(x).numpy()))
         self.assertTrue(
@@ -422,6 +409,8 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
         self.assertTrue(np.array_equal(x.reciprocal(), paddle.reciprocal(x)))
 
         # 2. Binary operation
+        self.assertTrue(
+            np.array_equal(x.divide(y).numpy(), paddle.divide(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
                 x.matmul(y, True, False).numpy(),
@@ -501,6 +490,73 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
         self.assertTrue(
             np.array_equal(
                 x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy()))
+        a = paddle.to_tensor([[1, 2], [3, 4]])
+        b = paddle.to_tensor([[4, 3], [2, 1]])
+        self.assertTrue(
+            np.array_equal(
+                x.where(a, b).numpy(), paddle.where(x, a, b).numpy()))
+
+        self.assertTrue(inspect.ismethod(a.dot))
+        self.assertTrue(inspect.ismethod(a.elementwise_add))
+        self.assertTrue(inspect.ismethod(a.elementwise_div))
+        self.assertTrue(inspect.ismethod(a.elementwise_floordiv))
+        self.assertTrue(inspect.ismethod(a.elementwise_mod))
+        self.assertTrue(inspect.ismethod(a.elementwise_sub))
+        self.assertTrue(inspect.ismethod(a.logsumexp))
+        self.assertTrue(inspect.ismethod(a.multiplex))
+        self.assertTrue(inspect.ismethod(a.prod))
+        self.assertTrue(inspect.ismethod(a.reduce_max))
+        self.assertTrue(inspect.ismethod(a.reduce_min))
+        self.assertTrue(inspect.ismethod(a.reduce_prod))
+        self.assertTrue(inspect.ismethod(a.reduce_sum))
+        self.assertTrue(inspect.ismethod(a.scale))
+        self.assertTrue(inspect.ismethod(a.stanh))
+        self.assertTrue(inspect.ismethod(a.sums))
+        self.assertTrue(inspect.ismethod(a.elementwise_sum))
+        self.assertTrue(inspect.ismethod(a.max))
+        self.assertTrue(inspect.ismethod(a.maximum))
+        self.assertTrue(inspect.ismethod(a.min))
+        self.assertTrue(inspect.ismethod(a.minimum))
+        self.assertTrue(inspect.ismethod(a.floor_divide))
+        self.assertTrue(inspect.ismethod(a.remainder))
+        self.assertTrue(inspect.ismethod(a.floor_mod))
+        self.assertTrue(inspect.ismethod(a.multiply))
+        self.assertTrue(inspect.ismethod(a.logsumexp))
+        self.assertTrue(inspect.ismethod(a.inverse))
+        self.assertTrue(inspect.ismethod(a.log1p))
+        self.assertTrue(inspect.ismethod(a.erf))
+        self.assertTrue(inspect.ismethod(a.addcmul))
+        self.assertTrue(inspect.ismethod(a.addmm))
+        self.assertTrue(inspect.ismethod(a.clip))
+        self.assertTrue(inspect.ismethod(a.trace))
+        self.assertTrue(inspect.ismethod(a.kron))
+        self.assertTrue(inspect.ismethod(a.isinf))
+        self.assertTrue(inspect.ismethod(a.isnan))
+        self.assertTrue(inspect.ismethod(a.concat))
+        self.assertTrue(inspect.ismethod(a.broadcast_to))
+        self.assertTrue(inspect.ismethod(a.scatter_nd_add))
+        self.assertTrue(inspect.ismethod(a.scatter_nd))
+        self.assertTrue(inspect.ismethod(a.shard_index))
+        self.assertTrue(inspect.ismethod(a.chunk))
+        self.assertTrue(inspect.ismethod(a.stack))
+        self.assertTrue(inspect.ismethod(a.strided_slice))
+        self.assertTrue(inspect.ismethod(a.unsqueeze))
+        self.assertTrue(inspect.ismethod(a.unstack))
+        self.assertTrue(inspect.ismethod(a.argmax))
+        self.assertTrue(inspect.ismethod(a.argmin))
+        self.assertTrue(inspect.ismethod(a.argsort))
+        self.assertTrue(inspect.ismethod(a.has_inf))
+        self.assertTrue(inspect.ismethod(a.has_nan))
+        self.assertTrue(inspect.ismethod(a.masked_select))
+        self.assertTrue(inspect.ismethod(a.topk))
+        self.assertTrue(inspect.ismethod(a.index_select))
+        self.assertTrue(inspect.ismethod(a.nonzero))
+        self.assertTrue(inspect.ismethod(a.sort))
+        self.assertTrue(inspect.ismethod(a.index_sample))
+        self.assertTrue(inspect.ismethod(a.mean))
+        self.assertTrue(inspect.ismethod(a.reduce_mean))
+        self.assertTrue(inspect.ismethod(a.std))
+        self.assertTrue(inspect.ismethod(a.numel))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_minimum_op.py b/python/paddle/fluid/tests/unittests/test_minimum_op.py
index 4c08b7386ca2c5da04c0a289872dacf68a2ea040..a0673c82c5b341e550485ebdcee4e4616693d641 100644
--- a/python/paddle/fluid/tests/unittests/test_minimum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_minimum_op.py
@@ -61,8 +61,8 @@ class ApiMinimumTest(unittest.TestCase):
     def test_dynamic_api(self):
         paddle.disable_static()
         np_x = np.array([10, 10]).astype('float64')
-        x = paddle.to_variable(self.input_x)
-        y = paddle.to_variable(self.input_y)
+        x = paddle.to_tensor(self.input_x)
+        y = paddle.to_tensor(self.input_y)
         z = paddle.minimum(x, y)
         np_z = z.numpy()
         z_expected = np.array(np.minimum(self.input_x, self.input_y))
@@ -73,8 +73,8 @@ class ApiMinimumTest(unittest.TestCase):
         np_x = np.random.rand(5, 4, 3, 2).astype("float64")
         np_y = np.random.rand(4, 3).astype("float64")
 
-        x = paddle.to_variable(self.input_x)
-        y = paddle.to_variable(self.input_y)
+        x = paddle.to_tensor(self.input_x)
+        y = paddle.to_tensor(self.input_y)
         result_1 = paddle.minimum(x, y, axis=1)
         result_2 = paddle.minimum(x, y, axis=-2)
         self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py
index 753d96c44114a552f4bdd299602d7f13f672efbf..e327307e955308e78f6e9640681c842060a34882 100644
--- a/python/paddle/fluid/tests/unittests/test_mse_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py
@@ -205,8 +205,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase):
 
             paddle.disable_static()
             dy_ret = paddle.nn.functional.mse_loss(
-                paddle.to_variable(input_np),
-                paddle.to_variable(target_np), 'mean')
+                paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'mean')
             dy_result = dy_ret.numpy()
 
             sub = input_np - target_np
@@ -240,8 +239,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase):
 
             paddle.disable_static()
             dy_ret = paddle.nn.functional.mse_loss(
-                paddle.to_variable(input_np),
-                paddle.to_variable(target_np), 'sum')
+                paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'sum')
             dy_result = dy_ret.numpy()
 
             sub = input_np - target_np
@@ -275,8 +273,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase):
 
             paddle.disable_static()
             dy_ret = paddle.nn.functional.mse_loss(
-                paddle.to_variable(input_np),
-                paddle.to_variable(target_np), 'none')
+                paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'none')
             dy_result = dy_ret.numpy()
 
             sub = input_np - target_np
diff --git a/python/paddle/fluid/tests/unittests/test_nll_loss.py b/python/paddle/fluid/tests/unittests/test_nll_loss.py
index e7154193beaf788a9d20f3c131b1df3420918266..c07bf949af39e38222b05394f65977c7027e2f13 100644
--- a/python/paddle/fluid/tests/unittests/test_nll_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_nll_loss.py
@@ -909,8 +909,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase):
             with fluid.dygraph.guard():
                 x_np = np.random.random(size=(5, )).astype(np.float64)
                 label_np = np.random.randint(0, 10, size=(5, )).astype(np.int64)
-                x = paddle.to_variable(x_np)
-                label = paddle.to_variable(label_np)
+                x = paddle.to_tensor(x_np)
+                label = paddle.to_tensor(label_np)
                 nll_loss = paddle.nn.loss.NLLLoss()
                 res = nll_loss(x, label)
 
@@ -933,8 +933,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase):
             with fluid.dygraph.guard():
                 x_np = np.random.random(size=(5, 3)).astype(np.float64)
                 label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64)
-                x = paddle.to_variable(x_np)
-                label = paddle.to_variable(label_np)
+                x = paddle.to_tensor(x_np)
+                label = paddle.to_tensor(label_np)
                 nll_loss = paddle.nn.loss.NLLLoss(reduction='')
                 res = nll_loss(x, label)
 
@@ -957,8 +957,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase):
             with fluid.dygraph.guard():
                 x_np = np.random.random(size=(5, 3)).astype(np.float64)
                 label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64)
-                x = paddle.to_variable(x_np)
-                label = paddle.to_variable(label_np)
+                x = paddle.to_tensor(x_np)
+                label = paddle.to_tensor(label_np)
                 res = paddle.nn.functional.nll_loss(x, label, reduction='')
 
         self.assertRaises(
diff --git a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
index 0ebe769fb9bce1aee8412ccebc216c2c85e97775..8ee3b2ac20320c3b82eb7bb81509a9a84ce959a7 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
@@ -101,9 +101,9 @@ def create_test_case(margin, reduction):
 
         def run_dynamic_functional_api(self, place):
             paddle.disable_static(place)
-            x = paddle.to_variable(self.x_data)
-            y = paddle.to_variable(self.y_data)
-            label = paddle.to_variable(self.label_data)
+            x = paddle.to_tensor(self.x_data)
+            y = paddle.to_tensor(self.y_data)
+            label = paddle.to_tensor(self.label_data)
 
             result = paddle.nn.functional.margin_ranking_loss(x, y, label,
                                                               margin, reduction)
@@ -117,9 +117,9 @@ def create_test_case(margin, reduction):
 
         def run_dynamic_api(self, place):
             paddle.disable_static(place)
-            x = paddle.to_variable(self.x_data)
-            y = paddle.to_variable(self.y_data)
-            label = paddle.to_variable(self.label_data)
+            x = paddle.to_tensor(self.x_data)
+            y = paddle.to_tensor(self.y_data)
+            label = paddle.to_tensor(self.label_data)
             margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
                 margin=margin, reduction=reduction)
             result = margin_rank_loss(x, y, label)
@@ -134,9 +134,9 @@ def create_test_case(margin, reduction):
         def run_dynamic_broadcast_api(self, place):
             paddle.disable_static(place)
             label_data = np.random.choice([-1, 1], size=[10]).astype("float64")
-            x = paddle.to_variable(self.x_data)
-            y = paddle.to_variable(self.y_data)
-            label = paddle.to_variable(label_data)
+            x = paddle.to_tensor(self.x_data)
+            y = paddle.to_tensor(self.y_data)
+            label = paddle.to_tensor(label_data)
             margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
                 margin=margin, reduction=reduction)
             result = margin_rank_loss(x, y, label)
diff --git a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
index d52a1f5d5b16ca7e0d58230a1a17624e5bff0b02..90132a0923df716e9e2a0224671006cb62c1bba0 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
@@ -56,7 +56,7 @@ class TestNNSigmoidAPI(unittest.TestCase):
 
     def check_dynamic_api(self, place):
         paddle.disable_static(place)
-        x = paddle.to_variable(self.x)
+        x = paddle.to_tensor(self.x)
         mysigmoid = nn.Sigmoid()
         y = mysigmoid(x)
         self.assertTrue(np.allclose(y.numpy(), self.y))
@@ -94,7 +94,7 @@ class TestNNFunctionalSigmoidAPI(unittest.TestCase):
 
     def check_dynamic_api(self):
         paddle.disable_static()
-        x = paddle.to_variable(self.x)
+        x = paddle.to_tensor(self.x)
         y = functional.sigmoid(x)
         self.assertTrue(np.allclose(y.numpy(), self.y))
 
diff --git a/python/paddle/fluid/tests/unittests/test_numel_op.py b/python/paddle/fluid/tests/unittests/test_numel_op.py
index 8512bc99e7451c73e5513b834fb6aa448717c646..800706e5965dffedadb61c384d946c8ed28bf826 100644
--- a/python/paddle/fluid/tests/unittests/test_numel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_numel_op.py
@@ -76,8 +76,8 @@ class TestNumelOoAPI(unittest.TestCase):
         paddle.disable_static(paddle.CPUPlace())
         input_1 = np.random.random([2, 1, 4, 5]).astype("int32")
         input_2 = np.random.random([1, 4, 5]).astype("int32")
-        x_1 = paddle.to_variable(input_1)
-        x_2 = paddle.to_variable(input_2)
+        x_1 = paddle.to_tensor(input_1)
+        x_2 = paddle.to_tensor(input_2)
         out_1 = paddle.numel(x_1)
         out_2 = paddle.numel(x_2)
         assert (np.array_equal(out_1.numpy().item(0), np.size(input_1)))
diff --git a/python/paddle/fluid/tests/unittests/test_ones_like.py b/python/paddle/fluid/tests/unittests/test_ones_like.py
index c1e6a3377710f98184e9541e287b911def89cd81..bb0d6f07bdbde18d155b66c7d014503747ebd887 100644
--- a/python/paddle/fluid/tests/unittests/test_ones_like.py
+++ b/python/paddle/fluid/tests/unittests/test_ones_like.py
@@ -63,7 +63,7 @@ class TestOnesLikeImpeartive(unittest.TestCase):
         place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
         paddle.disable_static(place)
-        x = paddle.to_variable(np.ones(shape))
+        x = paddle.to_tensor(np.ones(shape))
         for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
             out = ones_like(x, dtype)
             self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
index baf0efa6ec2e7edafb8d331423a7b47155283c21..cf138e67726163d3d1c990a180fa229b88fed99f 100644
--- a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
+++ b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
@@ -48,8 +48,8 @@ def test_static(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False):
 
 def test_dygraph(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False):
     paddle.disable_static()
-    x = paddle.to_variable(x_np)
-    y = paddle.to_variable(y_np)
+    x = paddle.to_tensor(x_np)
+    y = paddle.to_tensor(y_np)
     dist = paddle.nn.layer.distance.PairwiseDistance(
         p=p, epsilon=epsilon, keepdim=keepdim)
     distance = dist(x, y)
diff --git a/python/paddle/fluid/tests/unittests/test_sort_op.py b/python/paddle/fluid/tests/unittests/test_sort_op.py
index 015b72fd1c5275f758a109451110f61b97c4a0c7..366e0c7a3fa3ee714162e6041aa0d52dbfb30746 100644
--- a/python/paddle/fluid/tests/unittests/test_sort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sort_op.py
@@ -72,14 +72,14 @@ class TestSortDygraph(unittest.TestCase):
 
     def test_api_0(self):
         paddle.disable_static(self.place)
-        var_x = paddle.to_variable(self.input_data)
+        var_x = paddle.to_tensor(self.input_data)
         out = paddle.sort(var_x)
         self.assertEqual((np.sort(self.input_data) == out.numpy()).all(), True)
         paddle.enable_static()
 
     def test_api_1(self):
         paddle.disable_static(self.place)
-        var_x = paddle.to_variable(self.input_data)
+        var_x = paddle.to_tensor(self.input_data)
         out = paddle.sort(var_x, axis=-1)
         self.assertEqual(
             (np.sort(
diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py
index 5aaf31993448ab0ff0c69f648cfa84c62d3e198b..b0f065a26a006ee3553a84938fb5b6b2db7b3172 100644
--- a/python/paddle/fluid/tests/unittests/test_tile_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tile_op.py
@@ -230,13 +230,13 @@ class TestTileAPI(unittest.TestCase):
     def test_api(self):
         with fluid.dygraph.guard():
             np_x = np.random.random([12, 14]).astype("float32")
-            x = paddle.to_variable(np_x)
+            x = paddle.to_tensor(np_x)
 
             positive_2 = np.array([2]).astype("int32")
-            positive_2 = paddle.to_variable(positive_2)
+            positive_2 = paddle.to_tensor(positive_2)
 
             repeat_times = np.array([2, 3]).astype("int32")
-            repeat_times = paddle.to_variable(repeat_times)
+            repeat_times = paddle.to_tensor(repeat_times)
 
             out_1 = paddle.tile(x, repeat_times=[2, 3])
             out_2 = paddle.tile(x, repeat_times=[positive_2, 3])
diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py
index bd76edc9d8cadf14c6cf224b7708ff4acd6efef4..7c7a71a3be1b508c850048c3945f29ef7424654c 100644
--- a/python/paddle/fluid/tests/unittests/test_transformer_api.py
+++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py
@@ -234,23 +234,23 @@ class TestTransformer(unittest.TestCase):
                 if cache_dict:
                     if 'k' and 'v' in cache_dict:
                         cache_obj = multi_head_attn.Cache(
-                            paddle.to_variable(cache_dict['k']),
-                            paddle.to_variable(cache_dict['v']))
+                            paddle.to_tensor(cache_dict['k']),
+                            paddle.to_tensor(cache_dict['v']))
                     elif 'static_k' and 'static_v' in cache_dict:
                         cache_obj = multi_head_attn.StaticCache(
-                            paddle.to_variable(cache_dict['static_k']),
-                            paddle.to_variable(cache_dict['static_v']))
+                            paddle.to_tensor(cache_dict['static_k']),
+                            paddle.to_tensor(cache_dict['static_v']))
                 if attn_mask is not None:
                     attn_output = multi_head_attn(
-                        paddle.to_variable(query),
-                        paddle.to_variable(key),
-                        paddle.to_variable(value),
-                        paddle.to_variable(attn_mask), cache_obj)
+                        paddle.to_tensor(query),
+                        paddle.to_tensor(key),
+                        paddle.to_tensor(value),
+                        paddle.to_tensor(attn_mask), cache_obj)
                 else:
                     attn_output = multi_head_attn(
-                        paddle.to_variable(query),
-                        paddle.to_variable(key),
-                        paddle.to_variable(value), attn_mask, cache_obj)
+                        paddle.to_tensor(query),
+                        paddle.to_tensor(key),
+                        paddle.to_tensor(value), attn_mask, cache_obj)
                 attn_output = attn_output[0] if cache_dict else attn_output
 
                 # implementation by numpy
@@ -296,16 +296,16 @@ class TestTransformer(unittest.TestCase):
                 attn_dropout, act_dropout)
 
             encoder_output = encoder_layer(
-                paddle.to_variable(src),
-                paddle.to_variable(src_mask))  # paddle.to_variable(src_mask))
+                paddle.to_tensor(src),
+                paddle.to_tensor(src_mask))  # paddle.to_tensor(src_mask))
             # 4.numpy:
             # paddle self attention
             self_attn = MultiHeadAttention(
                 d_model, n_head, dropout=attn_dropout)
             attn_output = self_attn(
-                paddle.to_variable(src),
-                paddle.to_variable(src),
-                paddle.to_variable(src), paddle.to_variable(src_mask)).numpy()
+                paddle.to_tensor(src),
+                paddle.to_tensor(src),
+                paddle.to_tensor(src), paddle.to_tensor(src_mask)).numpy()
 
             src = attn_output + residual
             src_norm = layer_norm(src, d_model, encoder_layer.norm1)
@@ -348,13 +348,13 @@ class TestTransformer(unittest.TestCase):
                 cache_objs = None
                 if cache:
                     cache_objs = decoder_layer.gen_cache(
-                        paddle.to_variable(memory))
+                        paddle.to_tensor(memory))
 
                 decoder_output = decoder_layer(
-                    paddle.to_variable(tgt),
-                    paddle.to_variable(memory),
-                    paddle.to_variable(tgt_mask),
-                    paddle.to_variable(memory_mask), cache_objs)
+                    paddle.to_tensor(tgt),
+                    paddle.to_tensor(memory),
+                    paddle.to_tensor(tgt_mask),
+                    paddle.to_tensor(memory_mask), cache_objs)
 
                 decoder_output = decoder_output[0].numpy(
                 ) if cache else decoder_output.numpy()
@@ -365,10 +365,10 @@ class TestTransformer(unittest.TestCase):
                 self_attn_cache = cache_objs[
                     0] if cache_objs is not None else None
                 tgt = self_attn(
-                    paddle.to_variable(tgt),
-                    paddle.to_variable(tgt),
-                    paddle.to_variable(tgt),
-                    paddle.to_variable(tgt_mask), self_attn_cache)
+                    paddle.to_tensor(tgt),
+                    paddle.to_tensor(tgt),
+                    paddle.to_tensor(tgt),
+                    paddle.to_tensor(tgt_mask), self_attn_cache)
 
                 tgt = tgt[0].numpy() if cache else tgt.numpy()
 
@@ -380,10 +380,10 @@ class TestTransformer(unittest.TestCase):
                 cross_attn_cache = cache_objs[
                     1] if cache_objs is not None else None
                 tgt = cross_attn(
-                    paddle.to_variable(tgt_norm),
-                    paddle.to_variable(memory),
-                    paddle.to_variable(memory),
-                    paddle.to_variable(memory_mask), cross_attn_cache)
+                    paddle.to_tensor(tgt_norm),
+                    paddle.to_tensor(memory),
+                    paddle.to_tensor(memory),
+                    paddle.to_tensor(memory_mask), cross_attn_cache)
                 tgt = tgt[0].numpy() if cache else tgt.numpy()
 
                 # postprocess
@@ -416,7 +416,7 @@ class TestTransformer(unittest.TestCase):
             encoder = TransformerEncoder(encoder_layer, num_layers)
             # src, src_mask
             enc_output = encoder(
-                paddle.to_variable(src), paddle.to_variable(src_mask))
+                paddle.to_tensor(src), paddle.to_tensor(src_mask))
 
     def test_decoder(self):
         batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
@@ -438,9 +438,9 @@ class TestTransformer(unittest.TestCase):
             decoder = TransformerDecoder(decoder_layer, num_layers)
 
             output = decoder(
-                paddle.to_variable(tgt),
-                paddle.to_variable(memory),
-                paddle.to_variable(tgt_mask), paddle.to_variable(memory_mask))
+                paddle.to_tensor(tgt),
+                paddle.to_tensor(memory),
+                paddle.to_tensor(tgt_mask), paddle.to_tensor(memory_mask))
 
     def test_transformer(self):
         batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
@@ -453,24 +453,24 @@ class TestTransformer(unittest.TestCase):
                 n_head,
                 dim_feedforward=dim_feedforward,
                 dropout=dropout)
-            src = paddle.to_variable(
+            src = paddle.to_tensor(
                 np.random.rand(batch_size, source_length, d_model).astype(
                     "float32"))
-            tgt = paddle.to_variable(
+            tgt = paddle.to_tensor(
                 np.random.rand(batch_size, target_length, d_model).astype(
                     "float32"))
             src_mask = np.zeros((batch_size, n_head, source_length,
                                  source_length)).astype("float32")
             src_mask[0][0][0][0] = -np.inf
-            src_mask = paddle.to_variable(src_mask)
+            src_mask = paddle.to_tensor(src_mask)
             tgt_mask = np.zeros((batch_size, n_head, target_length,
                                  target_length)).astype("float32")
             tgt_mask[0][0][0][0] = -1e9
             memory_mask = np.zeros((batch_size, n_head, target_length,
                                     source_length)).astype("float32")
             memory_mask[0][0][0][0] = -1e9
-            tgt_mask, memory_mask = paddle.to_variable(
-                tgt_mask), paddle.to_variable(memory_mask)
+            tgt_mask, memory_mask = paddle.to_tensor(
+                tgt_mask), paddle.to_tensor(memory_mask)
             trans_output = transformer(src, tgt, src_mask, tgt_mask,
                                        memory_mask)
 
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index 6bc42f0712a1a8c9f9a0640e06042c42e7cc948f..c4155e0d8260fe1fdc4a0e49e955fc2bbff0fc89 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -424,10 +424,10 @@ class TestCTCLossAPICase(unittest.TestCase):
         loss_np = ctc.forward()
 
         paddle.disable_static()
-        softmax = paddle.to_variable(logits)
-        labels = paddle.to_variable(labels)
-        logits_length = paddle.to_variable(self.logits_length)
-        labels_length = paddle.to_variable(self.labels_length)
+        softmax = paddle.to_tensor(logits)
+        labels = paddle.to_tensor(labels)
+        logits_length = paddle.to_tensor(self.logits_length)
+        labels_length = paddle.to_tensor(self.labels_length)
         loss_pd_mean = F.ctc_loss(
             softmax,
             labels,
@@ -477,10 +477,10 @@ class TestCTCLossAPICase(unittest.TestCase):
         loss_np = ctc.forward()
 
         paddle.disable_static()
-        softmax = paddle.to_variable(logits)
-        labels = paddle.to_variable(labels)
-        logits_length = paddle.to_variable(self.logits_length)
-        labels_length = paddle.to_variable(self.labels_length)
+        softmax = paddle.to_tensor(logits)
+        labels = paddle.to_tensor(labels)
+        logits_length = paddle.to_tensor(self.logits_length)
+        labels_length = paddle.to_tensor(self.labels_length)
 
         loss_pd = paddle.nn.CTCLoss(self.blank, 'none')(
             softmax, labels, logits_length, labels_length)
diff --git a/python/paddle/inference/__init__.py b/python/paddle/inference/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c388301ec3408e436eacb2567e8e529d0bbc03bb
--- /dev/null
+++ b/python/paddle/inference/__init__.py
@@ -0,0 +1,16 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..fluid.inference import Config, DataType, PlaceType, PrecisionType, Tensor, \
+    Predictor, create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index 91a2a78203cbc50fec27b4f3ae8d3541ac4ec5da..8ee4d73ea847ea116ea4401b5b05ef1b925950fe 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -62,6 +62,22 @@ def cache(reader):
 
     Returns:
         generator: a decorated reader object which yields data from cached memory.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            
+            def reader():
+                for i in range(3):
+                    yield i
+            
+            # All data is cached into memory
+            cached_reader = paddle.io.cache(reader)
+            
+            # Output: 0 1 2
+            for i in cached_reader():
+                print(i)
     """
     all_data = tuple(reader())
 
@@ -296,12 +312,28 @@ def buffered(reader, size):
     buffer. Reading from the buffered data reader will proceed as long
     as the buffer is not empty.
 
-    :param reader: the data reader to read from.
-    :type reader: callable
-    :param size: max buffer size.
-    :type size: int
+    Args:
+        reader(generator): the data reader to read from.
+        size(int): max buffer size.
+
+    Returns:
+        generator: the buffered data reader.
+    
+    Examples:
+        .. code-block:: python
 
-    :returns: the buffered data reader.
+            import paddle
+            
+            def reader():
+                for i in range(3):
+                    yield i
+            
+            # Create a buffered reader, and the buffer size is 2.
+            buffered_reader = paddle.io.buffered(reader, 2)
+            
+            # Output: 0 1 2
+            for i in buffered_reader():
+                print(i)
     """
 
     class EndSignal():
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 9de407841fb461713d00f997afdf33a38a531245..dc6a04a4723bd92dbe1c76fce5b3e52981136211 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -53,7 +53,7 @@ __all__ = [
     'shard_index',
     'slice',
     'split',
-    'chunk'
+    'chunk',
     'squeeze',
     'stack',
     'strided_slice',
diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt
index e1bc65a5d15c2883e14d20c5e06c2ee3cd726ea5..6fb73b08c11b417332b064df7408e78ed390cc2f 100644
--- a/python/paddle/tests/CMakeLists.txt
+++ b/python/paddle/tests/CMakeLists.txt
@@ -8,6 +8,10 @@ foreach(TEST_OP ${DIST_TEST_OPS})
     list(REMOVE_ITEM TEST_OPS ${TEST_OP})
 endforeach()
 
+# disable test_pretrained_model and test_vision_models
+list(REMOVE_ITEM TEST_OPS test_pretrained_model)
+list(REMOVE_ITEM TEST_OPS test_vision_models)
+
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
diff --git a/python/setup.py.in b/python/setup.py.in
index d85a23a5edd31f77514b468731097759f47533c1..467c5cb86779b80e51794cf800226d64534e8676 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -155,6 +155,7 @@ packages=['paddle',
           'paddle.distributed.fleet.utils',
           'paddle.framework',
           'paddle.jit',
+          'paddle.inference',
           'paddle.fluid',
           'paddle.fluid.inference',
           'paddle.fluid.dygraph',
diff --git a/tools/enforce/count_enforce_by_file.sh b/tools/enforce/count_enforce_by_file.sh
index 1858bd0fd17aac7273318ddbb37fc0d9c512f48d..c1e2903c092ce4124c55566679e081dbe3a03445 100644
--- a/tools/enforce/count_enforce_by_file.sh
+++ b/tools/enforce/count_enforce_by_file.sh
@@ -57,7 +57,14 @@ FILE_WHITE_LIST="\
     random_crop_op.h \
     elementwise_op_function.cu.h \
     fused_elemwise_activation_op.cc \
-    auc_op.cu"
+    auc_op.cu \
+    unsqueeze_op.h \
+    unsqueeze_op.cc \
+    enforce.h \
+    errors_test.cc \
+    cross_entropy.cu \
+    cross_entropy.h \
+    unpooling.cu"
 
 function count_file_recursively(){
     dir_name=$1
diff --git a/tools/wlist.json b/tools/wlist.json
index 20f6a9cbaedb391995b3757612ec24f2061a8a81..5591f90da4ba807871663e56fe4e3b11bf2fbd8f 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -105,8 +105,6 @@
         "convert_dist_to_sparse_program",
         "load_persistables_for_increment",
         "load_persistables_for_inference",
-        "cache",
-        "buffered",
         "xmap_readers",
         "Metric.reset",
         "Metric.update",