diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb796103350ac4403d4151cf08eb4315bcde68fd..b1554fba5e1fa48b5cbdfe2e5b9f317a4f7fefb3 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,8 +63,29 @@ if(WIN32)
     set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/MD")
+        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+      endif()
+    endforeach(flag_var)
   endif()
-
+
+  # windows build turn off warnings.
+  foreach(flag_var
+      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+      CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+    string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
+  endforeach(flag_var)
+  foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+    set(${flag_var} "${${flag_var}} /w")
+  endforeach(flag_var)
+
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
   message(STATUS "Using parallel compiling (/MP)")
diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake
index af5dd0e2c9b2d19929f58363d08e7ff40d43b013..351ef1c7c7aebb698a5d41689352a913d0b950e8 100644
--- a/cmake/external/cryptopp.cmake
+++ b/cmake/external/cryptopp.cmake
@@ -22,23 +22,8 @@ SET(CRYPTOPP_TAG CRYPTOPP_8_2_0)
 
 IF(WIN32)
   SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE)
-  SET(CRYPTOPP_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
-  set(CompilerFlags
-      CMAKE_CXX_FLAGS
-      CMAKE_CXX_FLAGS_DEBUG
-      CMAKE_CXX_FLAGS_RELEASE
-      CMAKE_C_FLAGS
-      CMAKE_C_FLAGS_DEBUG
-      CMAKE_C_FLAGS_RELEASE
-      )
-  foreach(CompilerFlag ${CompilerFlags})
-    string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
-  endforeach()
 ELSE(WIN32)
   SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE)
-  SET(CRYPTOPP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
 ENDIF(WIN32)
 
 set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
@@ -48,7 +33,7 @@ set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
     -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib
     -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR}
    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    -DCMAKE_CXX_FLAGS=${CRYPTOPP_CMAKE_CXX_FLAGS}
+    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
     -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
     -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 415e07c75425345f5f1ad29a8544e02a5bfb12e4..ed0bf8396b3faa22350811cf1711f5d1e5b89998 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -90,20 +90,6 @@ macro(safe_set_nvflag flag_name)
   endif()
 endmacro()
 
-macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
-    if (BUILD_SHARED_LIBS)
-      return() # if build shared libs, the flags keep same with '/MD'
-    endif(BUILD_SHARED_LIBS)
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
-endmacro()
 
 CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
 if(NOT UINT64_MAX_EXISTS)
@@ -229,20 +215,3 @@ endforeach()
 
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
-
-if(WIN32)
-  # windows build turn off warnings.
-  if(MSVC_STATIC_CRT)
-    safe_set_static_flag()
-  endif()
-  foreach(flag_var
-      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-      CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-    string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-  endforeach(flag_var)
-  foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
-    set(${flag_var} "${${flag_var}} /w")
-  endforeach(flag_var)
-endif()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index b0a6dfe29020781e57d57861137861366864abdb..a23862653677d6d92f244503ac3bd95c101e91bf 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -446,6 +446,9 @@ function(nv_library TARGET_NAME)
         message(FATAL "Please specify source file or library in nv_library.")
       endif()
     endif(nv_library_SRCS)
+    if (WIN32)
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+    endif(WIN32)
   endif()
 endfunction(nv_library)
 
@@ -461,6 +464,9 @@ function(nv_binary TARGET_NAME)
       add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
       common_link(${TARGET_NAME})
     endif()
+    if (WIN32)
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+    endif(WIN32)
   endif()
 endfunction(nv_binary)
 
@@ -482,6 +488,9 @@ function(nv_test TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+    if (WIN32)
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+    endif(WIN32)
   endif()
 endfunction(nv_test)
 
@@ -712,6 +721,7 @@ function(proto_library TARGET_NAME)
   set(proto_hdrs)
   paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
   cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
+  add_dependencies(extern_xxhash ${TARGET_NAME})
 endfunction()
 
 function(py_proto_compile TARGET_NAME)
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index e3c2409f103d36befed29176b354f77257fea9ec..f4603051a0e7e9f206d5344fd948f7750a09c173 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -19,12 +19,11 @@ set(PADDLE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_install_dir" CACHE STRING
 set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING
     "A path setting paddle inference shared and static libraries")
 
-# TODO(zhaolong)
-# At present, the size of static lib in Windows exceeds the system limit,
-# so the generation of static lib is temporarily turned off.
+# At present, the size of static lib in Windows is very large,
+# so we need to crop the library size.
 if(WIN32)
     #todo: remove the option
-    option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF)
+    option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic." OFF)
     if(NOT PYTHON_EXECUTABLE)
         FIND_PACKAGE(PythonInterp REQUIRED)
     endif()
@@ -165,25 +164,22 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 if(WIN32)
     if(WITH_STATIC_LIB)
-        set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib)
+        set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib
+            ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.*)
     else()
         set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.dll
-             ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
+            ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
     endif()
+    copy(inference_lib_dist
+        SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
+        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
+             ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
 else(WIN32)
     set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
-endif(WIN32)
-
-if(WIN32 AND NOT WITH_STATIC_LIB)
-    copy(inference_lib_dist
-        SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
-        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
-             ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
-else()
-    copy(inference_lib_dist
+    copy(inference_lib_dist
        SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
-endif()
+endif(WIN32)
 
 copy(inference_lib_dist
      SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
@@ -199,7 +195,11 @@ set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_insta
 copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR})
 
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
+if(WIN32)
+    set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/${CMAKE_BUILD_TYPE}/paddle_fluid_c.*)
+else(WIN32)
+    set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
+endif(WIN32)
 
 copy(inference_lib_dist
      SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_fluid_c_lib}
@@ -211,12 +211,12 @@ add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
 
 set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid")
 set(module "inference")
-if(WIN32 AND NOT WITH_STATIC_LIB)
+if(WIN32)
     copy(fluid_lib_dist
         SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
         DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
    )
-else()
+  else()
     copy(fluid_lib_dist
         SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
         DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
diff --git a/cmake/init.cmake b/cmake/init.cmake
index 7dfe60f9dd8f021facba6925a465cb58bc5de25d..902dfb11fc0afa40ea5bd2b36543b2432a3bf384 100644
--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@@ -26,4 +26,7 @@ if(WITH_GPU)
     set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
 endif()
 
+if(WIN32)
+    set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
+endif()
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index aea972ab3db2af862f5230ea6c1eabeed8b611c5..21080fbe8fd2e14cf7fd805e01948f2f28535c22 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -127,7 +127,8 @@ function(op_library TARGET)
     "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
     "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
     "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
+"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
+"fused_bn_add_activation_op")
   if ("${TARGET}" STREQUAL "${manual_pybind_op}")
     set(pybind_flag 1)
   endif()
diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props
new file mode 100644
index 0000000000000000000000000000000000000000..7e434c6d907cc40733a81d8a7cdbe7c285a2bd41
--- /dev/null
+++ b/cmake/paddle_win.props
@@ -0,0 +1,91 @@
+
+
+
+
+
+true
+false
+true
+false
+false
+InheritFromHost
+InheritFromHost
+InheritFromHost
+InheritFromHost
+InheritFromHost
+
+-ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions]
+--use-local-env
+[CodeGeneration]
+-clean
+
+-Xcompiler "/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) [RuntimeChecks] [Runtime] [TypeInfo]"
+
+%(BaseCommandLineTemplate) [CompileOut] "%(FullPath)"
+%(BaseCommandLineTemplate) [HostDebugInfo] [Emulation] [FastMath] [Defines] %(HostCommandLineTemplate) [CompileOut] "%(FullPath)"
+
+
+# (Approximate command-line. Settings inherited from host are not visible below.)
+# (Please see the output window after a build for the full command-line)
+
+# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
+set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
+"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(DriverApiCommandLineTemplate)
+
+# Runtime API (NVCC Compilation Type is hybrid object or .c file)
+set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
+"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(RuntimeApiCommandLineTemplate)
+
+Compiling CUDA source file %(Identity)...
+Skipping CUDA source file %(Identity) (excluded from build).
+ + + %(Filename)%(Extension).cache + $(IntDir)%(PropsCacheOutputFile) + + $(MSBuildProjectFullPath) + + + + true + $(IntDir)$(TargetName).device-link.obj + + + true + + true + + InheritFromProject + InheritFromProject + + + + + + + + + + + + + -Xcompiler "/EHsc [Warning] /nologo [Optimization] [RuntimeChecks] [Runtime] [TypeInfo]" + "$(CudaToolkitNvccPath)" -dlink [LinkOut] %(HostCommandLineTemplate) [AdditionalLibraryDirectories] [AdditionalDependencies] [AdditionalOptions] [CodeGeneration] [GPUDebugInfo] [TargetMachinePlatform] [Inputs] + +# (Approximate command-line. Settings inherited from host are not visible below.) +# (Please see the output window after a build for the full command-line) + +%(LinkCommandLineTemplate) + + + + + %(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + + + + %(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + + + + diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 9edfcb967abc26a25a94d368298c1c475295019f..1eb2096af91dc99ac22b000d2de269bde2efcbbf 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -39,6 +39,7 @@ set(third_party_deps) # REPOSITORY ${TARGET_REPOSITORY} # TAG ${TARGET_TAG} # DIR ${TARGET_SOURCE_DIR}) + FUNCTION(cache_third_party TARGET) SET(options "") SET(oneValueArgs URL REPOSITORY TAG DIR) @@ -269,6 +270,10 @@ if(WITH_PSLIB) endif() endif(WITH_PSLIB) +if(NOT WIN32 AND NOT APPLE) + include(external/gloo) + list(APPEND third_party_deps extern_gloo) +endif() if(WITH_BOX_PS) include(external/box_ps) @@ -276,10 +281,6 @@ if(WITH_BOX_PS) endif(WITH_BOX_PS) if(WITH_DISTRIBUTE) - if(WITH_GLOO) - include(external/gloo) - list(APPEND third_party_deps extern_gloo) - endif() if(WITH_GRPC) list(APPEND third_party_deps extern_grpc) diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc index 9ca3fe31a33c78621b9e25acaf095e8240af7db6..7460686c1a383531191377cf56ceefa6fbb26a5f 100644 --- a/paddle/fluid/framework/attribute.cc +++ b/paddle/fluid/framework/attribute.cc @@ -14,8 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/attribute.h" -#include - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 5c6e421516269a9b9865605400efa772f944a96f..8c8fcadb05be080699fb6f7a6ab2ac3bc6328c09 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -30,6 +30,8 @@ namespace paddle { namespace framework { class ProgramDesc; +class OpDesc; +class VarDesc; // Each Protobuf Message, we provide a XXXBind class. In that class, we optimize // read/write speed. Only when we want the protobuf message, the local changes diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc index 0dd2768ccb9ffa1dc7b85dca500095f8c10479c3..48181dac662272679556c444b3055fc950b8c74d 100644 --- a/paddle/fluid/framework/c/c_api.cc +++ b/paddle/fluid/framework/c/c_api.cc @@ -12,17 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/c/c_api.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/init.h" extern "C" { diff --git a/paddle/fluid/framework/c/c_api.h b/paddle/fluid/framework/c/c_api.h index 04dbfbebe5d7646e93678fff3418c4eed6bc691a..a9ec402f381e43b51887b6467d8d1baccf98ad37 100644 --- a/paddle/fluid/framework/c/c_api.h +++ b/paddle/fluid/framework/c/c_api.h @@ -24,6 +24,15 @@ limitations under the License. */ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class OpInfoMap; +} // namespace framework +namespace platform { +class DeviceContextPool; +} // namespace platform +} // namespace paddle + #ifdef __cplusplus extern "C" { #endif diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 64a645bf8b28a84b153db74afd2ec268d79ff245..503f1513aad20c1598b034ef1fb90bd0aa1a0224 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -277,7 +277,7 @@ class ChannelObject { size_t finished = 0; while (finished < n && WaitForWrite(lock)) { size_t m = - std::min(n - finished, capacity_ + reading_count_ - data_.size()); + (std::min)(n - finished, capacity_ + reading_count_ - data_.size()); for (size_t i = 0; i < m; i++) { data_.push_back(std::move(p[finished++])); } diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc index 9350c387a6e49122c6b4c3fe01f5de782a1ae5e4..5b89166e2f4826fe836c74a570abcb8a1790f58e 100644 --- a/paddle/fluid/framework/copy_same_tensor_test.cc +++ b/paddle/fluid/framework/copy_same_tensor_test.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include + #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/framework/data_device_transform.h b/paddle/fluid/framework/data_device_transform.h index 8ff97646cfce7979b1c9c570e6de4f1bd26916c3..60b52a5e7069fb8fdd0664bcfa99eaa000dbff12 100644 --- a/paddle/fluid/framework/data_device_transform.h +++ b/paddle/fluid/framework/data_device_transform.h @@ -21,6 +21,8 @@ limitations under the License. */ namespace paddle { namespace framework { +class Tensor; + void TransDataDevice(const Tensor& in, const platform::Place& dst_place, Tensor* out); diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index b48d152fe35826363a77104a5cbe39ad800b5eb1..da156bfc5c79f6faf7544794675c40dfd10e7349 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -41,6 +41,15 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace framework { +class DataFeedDesc; +class LoDTensor; +class Scope; +class Variable; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { @@ -418,6 +427,7 @@ class MultiSlotType { std::string DebugString() { std::stringstream ss; + ss << "\ntype: " << type_ << "\n"; ss << "offset: "; ss << "["; diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index 1d8aec7624043f96c5aae908c5faa536e818b5a5..048d539f9b9e5039b7b3b5ec2ae672032f89be48 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -17,10 +17,10 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/data_feed.h" - namespace paddle { namespace framework { +class DataFeed; + typedef std::shared_ptr (*Createdata_feedFunction)(); typedef std::unordered_map data_feedMap; data_feedMap g_data_feed_map; diff --git a/paddle/fluid/framework/data_feed_factory.h b/paddle/fluid/framework/data_feed_factory.h index 13678edb0b8d084a0b3016d93f6e1bc32ce0169a..49381a98706ddedb9bb0b5900fcb85ac4d4a3719 100644 --- a/paddle/fluid/framework/data_feed_factory.h +++ b/paddle/fluid/framework/data_feed_factory.h @@ -16,10 +16,13 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/data_feed.h" namespace paddle { namespace framework { +class DataFeed; + class DataFeedFactory { public: static std::string DataFeedTypeList(); diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index f757e244e38ec965d62d673e63ed082ca70c63c7..108cd9ac6d1c0778b7f614116b5739502fcfb0ee 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/fluid/framework/data_layout_transform.h" + #include -#include #include "paddle/fluid/operators/math/math_function.h" #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index b92c47c2eb018603e1b3156921fb2c1702864c57..238f2d2e67914c7ae1443d09cf915439ebad4dd5 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -17,10 +17,18 @@ #include #include #include + #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" +namespace paddle { +namespace framework { +class OpKernelType; +class Tensor; +} // namespace framework +} // namespace paddle + #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc index 8dfad23db65178c46140b887811846e413bebd00..20443e9a3dcad62770e62d9a780960bf3db43111 100644 --- a/paddle/fluid/framework/data_layout_transform_test.cc +++ b/paddle/fluid/framework/data_layout_transform_test.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/framework/data_layout_transform.h" #include "gtest/gtest.h" -#include "paddle/fluid/platform/device_context.h" TEST(DataTransform, DataLayoutFunction) { auto place = paddle::platform::CPUPlace(); diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index f54311eebfade312057224ddda075c03fdc0666d..3a40de6988f294314f83ba0308e9d57de84d60f7 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -18,8 +18,13 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_type_transform.h" +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + #ifdef PADDLE_WITH_MKLDNN -#include #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h index ef2271d530d5f2c915467a08c00dae433284c663..2bbdac52ee49fd23bb036b4475f2310787a6ae14 100644 --- a/paddle/fluid/framework/data_transform.h +++ b/paddle/fluid/framework/data_transform.h @@ -30,6 +30,10 @@ limitations under the License. */ namespace paddle { namespace framework { +class OpKernelType; +class Tensor; +class Variable; + void TransformData(const OpKernelType &expected_kernel_type, const OpKernelType &kernel_type_for_var, const Tensor &input_tensor, Tensor *out); diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 8188d5cde1b90436d040e8b9dcc1070ac85bf319..e4be866dca13585465610a3aff8bd7a4d54dd9c5 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/framework/data_type.h" -#include #include #include diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 720e422e114835f367317d4ba265254856885c15..4477a9cac09d8f060d6b8f2f7d5bf4dfd7b3f893 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -15,12 +15,19 @@ limitations under the License. 
*/ #pragma once #include #include -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace platform { +struct bfloat16; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 331596da33acc151810cd616ea6d5bdcae333b30..5599edcd222ba4748ae4b88a668d79530a1681d0 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -14,9 +14,17 @@ #include "paddle/fluid/framework/data_type.h" #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/tensor.h" +namespace paddle { +namespace platform { +struct bfloat16; +struct float16; +} // namespace platform +} // namespace paddle + TEST(DataType, float16) { using paddle::framework::Tensor; using paddle::platform::CPUPlace; diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h index 1c281b03ed61ac70e16a43d75a79854bdafd8836..b42b2f594aa7f36aeaff97d4b66d5924ba8ac60b 100644 --- a/paddle/fluid/framework/data_type_transform.h +++ b/paddle/fluid/framework/data_type_transform.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" @@ -23,6 +24,9 @@ limitations under the License. */ namespace paddle { namespace framework { +class OpKernelType; +class Tensor; + using KernelTypePair = std::pair; void TransDataType(const OpKernelType& kernel_type_for_var, diff --git a/paddle/fluid/framework/dataset_factory.cc b/paddle/fluid/framework/dataset_factory.cc index 3a28c101d48342ef639956a974d59aee8ae42ed6..cdb513f70ad94861a1ba0cb6e90d72ace40213bb 100644 --- a/paddle/fluid/framework/dataset_factory.cc +++ b/paddle/fluid/framework/dataset_factory.cc @@ -13,12 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/dataset_factory.h" -#include #include #include -#include "paddle/fluid/framework/data_set.h" - namespace paddle { namespace framework { typedef std::unique_ptr (*CreateDatasetFunction)(); diff --git a/paddle/fluid/framework/dataset_factory.h b/paddle/fluid/framework/dataset_factory.h index d4a36cec22fc0af27a38ee7cd810a2eaa7988ea1..425c488daa8689eb2faa56891c7d5c80d666fcf2 100644 --- a/paddle/fluid/framework/dataset_factory.h +++ b/paddle/fluid/framework/dataset_factory.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/data_set.h" namespace paddle { diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index 29c4732f99118fe42f08317625ec07edf52ec217..e69fb4e761939d8e1ea490563c5921af1a30b7ef 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/dim.h" namespace paddle { diff --git a/paddle/fluid/framework/ddim_test.cc b/paddle/fluid/framework/ddim_test.cc index b7b42fa019f6b2243719ebdb1628ea83c220abf9..e89f77ae496c499b7408d4e0836c2abe5cff9660 100644 --- a/paddle/fluid/framework/ddim_test.cc +++ b/paddle/fluid/framework/ddim_test.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include "gtest/gtest.h" #include "paddle/fluid/framework/ddim.h" diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index 36f5d3adfadfbb2bef12145042db104f9ee4c1e3..e0064ec264223cafff3c6b93bfde841799ee7c3a 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -20,6 +20,17 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +namespace platform { +class NCCLCommunicator; +} // namespace platform +} // namespace paddle #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/nccl_helper.h" diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 588248d6454fbc1645b75ea2a8b8a3c662f1bbda..1412e2cd9dbb8319c5161fe5fdf0eda694d7dfea 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -24,6 +24,20 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +namespace details { +struct VarHandle; +} // namespace details +namespace ir { +class Node; +} // namespace ir +} // namespace framework +namespace platform { +struct NCCLContextMap; +} // namespace platform +} // namespace paddle + #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 6d14c7e4e7bf23c2140195b5fb3a104fd9f3bb59..e455879a68f70b3f4f33fb5e6ede0fd9e9f22d5f 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -21,13 +21,15 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h" - #include "paddle/fluid/platform/device_context.h" namespace paddle { namespace framework { namespace details { +struct DummyVarHandle; +struct VarHandle; + namespace f = paddle::framework; namespace p = paddle::platform; diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 01d496d4ea7f7f0d0347b552e13d988fdc68e0c7..87b27eaa440cc4c546c5e5bfc38b01fc057d4991 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -27,6 +27,18 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" +namespace paddle { +namespace framework { +namespace ir { +class Graph; +class PassBuilder; +} // namespace ir +} // namespace 
framework +namespace platform { +class NCCLCommunicator; +} // namespace platform +} // namespace paddle + #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 0b653e57f6d48f9d919ee4f09db5b6ab6b2451b7..2256b826ed501f9974806d2ef20ef137d1ca0724 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -19,6 +19,8 @@ namespace paddle { namespace framework { namespace details { +struct VarHandleBase; + ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, size_t scope_idx) diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 41c51b95800f48afd25136d0521d0cfc4d2d8f6c..3c219ee27d3d6ed5e93e0d45acd457c84a77a38a 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -24,9 +24,21 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class OperatorBase; +class Scope; +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { +struct VarHandleBase; + class ComputationOpHandle : public OpHandleBase { public: ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index a048799a2807554f27ac54c400dc1bf8ae113e85..8edce6782de4aa7caeb7f324a375f568ea296c42 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -19,12 +19,23 @@ #include #include #include + #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" +namespace paddle { +namespace platform { +class CUDADeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { class Scope; +class GarbageCollector; +namespace ir { +class Node; +} // namespace ir namespace ir { class MemOptVarInfo; diff --git a/paddle/fluid/framework/details/exception_holder_test.cc b/paddle/fluid/framework/details/exception_holder_test.cc index c20563a08605086d6fd65506b5d0176bb8dce8bb..3db358667ebe9035b21babf8d9e873b7390dc3ea 100644 --- a/paddle/fluid/framework/details/exception_holder_test.cc +++ b/paddle/fluid/framework/details/exception_holder_test.cc @@ -13,8 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/exception_holder.h" -#include -#include + #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator.h" diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 6aae523365ed50e78a78b318ac0990490c801eb3..09aedafc6bb2e1cc0223a4bcb67b8814616da121 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -15,9 +15,14 @@ #include "paddle/fluid/framework/details/fetch_async_op_handle.h" #include #include -#include #include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.h b/paddle/fluid/framework/details/fetch_async_op_handle.h index 691a3286c270badad938610811cc6e73d63c2c04..ff9271942daa2b45fa5836ae0b0c94d23e5ebfef 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.h +++ b/paddle/fluid/framework/details/fetch_async_op_handle.h @@ -22,6 +22,18 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class LoDTensor; +namespace ir { +class Node; +} // namespace ir +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/fetch_barrier_op_handle.cc b/paddle/fluid/framework/details/fetch_barrier_op_handle.cc index 127183a32e938de57ce4f7cb5aed4e72f8f09682..fc836ade786dda556421b7623a93c31b0b08228d 100644 --- a/paddle/fluid/framework/details/fetch_barrier_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.cc @@ -19,6 +19,8 @@ namespace paddle { namespace framework { namespace details { +struct VarHandleBase; + FetchBarrierOpHandle::FetchBarrierOpHandle( ir::Node *node, const std::vector &local_scopes, const std::vector &places) diff --git a/paddle/fluid/framework/details/fetch_barrier_op_handle.h b/paddle/fluid/framework/details/fetch_barrier_op_handle.h index d1f7e08b28e7d8291c11bd61588c978f591060c2..7ce790f38e8cbb7ebb110dc3a01d9c1e22e73cfe 100644 --- a/paddle/fluid/framework/details/fetch_barrier_op_handle.h +++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.h @@ -24,6 +24,15 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class Scope; +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { @@ -32,6 +41,8 @@ namespace details { // all places if there are multiple places, must init with // multiple dev_ctxes_ !!!! 
+struct VarHandleBase; + struct FetchBarrierOpHandle : public OpHandleBase { public: FetchBarrierOpHandle(ir::Node *node, const std::vector &local_scopes, diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h index 31ffd1211d205ff943f5e65b95bddcfa76bcd05a..41deeb0af27f904c5ea892a2a42e3ab3cc3b7216 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.h +++ b/paddle/fluid/framework/details/fetch_op_handle.h @@ -22,6 +22,17 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index 16c13ac1c03e340786c95ac11019abd8c6ff0ef9..9bed792a42fc797d0af396dcaf0423c7e06eafe4 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -17,10 +17,22 @@ #include #include #include + #include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +namespace platform { +class NCCLCommunicator; +} // namespace platform +} // namespace paddle #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/nccl_helper.h" diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index 8a59d2bfa9acd7b541d4049d2088b0162d7a6350..8fd3ec56d18b64fdd157fdcd820897f4a1f7fc2f 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -25,6 +25,17 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +namespace platform { +struct NCCLContextMap; +} // namespace platform +} // namespace paddle + #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index cbded074f205f423ac20580047503b50af0f368d..761a5b5a30a0e04690a7dc94752179130c85320a 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -17,11 +17,20 @@ #include #include "gtest/gtest.h" #include "paddle/fluid/framework/details/broadcast_op_handle_test.h" +#include "paddle/fluid/framework/details/op_handle_base.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle namespace paddle { namespace framework { namespace details { +struct VarHandle; + struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { std::vector out_varnames_; std::vector> nodes_; diff --git a/paddle/fluid/framework/details/gather_op_handle.h 
b/paddle/fluid/framework/details/gather_op_handle.h index ac87b246b50f8e0df1d0cc082087d4128a79384b..9cbd94cd6b877b5392e31499720e61a0491acbe8 100644 --- a/paddle/fluid/framework/details/gather_op_handle.h +++ b/paddle/fluid/framework/details/gather_op_handle.h @@ -24,6 +24,14 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 5d8562e7046fd2f1609ba34ce2dd71b9fa28be77..f3fcc1a436df38986e1202755cd88f14069028a8 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -17,11 +17,11 @@ #include #include "gtest/gtest.h" -#include "paddle/fluid/platform/device_context.h" - namespace paddle { namespace framework { namespace details { +struct DummyVarHandle; + namespace f = paddle::framework; namespace p = paddle::platform; diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index 21e781877a42754d215e48becb4bc7a1e6e2f262..c3a18433cf89d6241ca735e003c797535cc3b26c 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -20,16 +20,21 @@ #include #include #include + #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/var_handle.h" - +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +} // namespace paddle namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index 15d00932f1c448bf25d49469e0ca4d85ca6f1e1c..b4459e5a7c1cc6ad6faa9e19f39bff47fe128344 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -19,6 +19,12 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 097f54d5d5891390fdd479d3e6f62ae0e97cd0d4..eb3d9c32ffc1f45ab843a8d9f00b0918acd5e0f6 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -24,10 +24,22 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/macros.h" +namespace paddle { +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { class Scope; +namespace details { +struct VarHandleBase; +} // namespace details +namespace ir { +class Node; +} // namespace ir namespace details { diff 
--git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 8b92bdef4757729b911aee409092eee15b902bfd..e76a48d207d9b477cb024c039025b61a88829b76 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -24,6 +24,21 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +class SelectedRows; +namespace details { +struct VarHandle; +} // namespace details +namespace ir { +class Node; +} // namespace ir +} // namespace framework +namespace platform { +struct NCCLContextMap; +} // namespace platform +} // namespace paddle #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h index d86d33dd676ca066b8772ac2afbab05cf0d98b38..909f565f2c048b96181bf4b7112ccf44e271ab2d 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.h +++ b/paddle/fluid/framework/details/rpc_op_handle.h @@ -24,6 +24,16 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" +namespace paddle { +namespace framework { +class OpDesc; +class Scope; +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 6e2f2327abd62b12d9368992674ce399d50dff35..287667d5ee97ef8779f649761f93e367b5511b29 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -13,9 +13,17 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" + #include + #include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index d4f28dbe2b261be9c5d48aa50e38edfe36bfcfd3..02e5aa88443df1222520f50ebd44db9cc26d8e3f 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -21,6 +21,18 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +namespace paddle { +namespace framework { +class Scope; +namespace ir { +class Node; +} // namespace ir +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.h b/paddle/fluid/framework/details/scope_buffered_monitor.h index 1246c35af6aedab16c2370aa881e85be5a773ddc..3a94534eff4587f595f8ad83217ab93896a91132 100644 --- a/paddle/fluid/framework/details/scope_buffered_monitor.h +++ b/paddle/fluid/framework/details/scope_buffered_monitor.h @@ -17,7 +17,9 @@ #include #include #include + #include "paddle/fluid/framework/scope.h" + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc index 5fbaf3cbfe028638ad9219d9e1286480ae16ee6b..bf93d8f85b16cbe47373c1982c4eff2d678158c8 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc @@ -22,6 +22,17 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/enforce.h" +namespace paddle { +namespace framework { +class Scope; +class Tensor; +class Variable; +namespace ir { +class MemOptVarInfo; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h index be49d1c432b2ab2b9741d873ba005b400e9f0829..0db69d07bf63a27b15d7541f88376ca42c2d1817 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h @@ -25,6 +25,15 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" +namespace paddle { +namespace framework { +class Scope; +namespace ir { +class MemOptVarInfo; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index be3f5515a971900258ab5914b579deffe5d5b7d6..3d53bb62855e092195fdff63e4b4a84fa2bb7eb3 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -23,10 +23,20 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/enforce.h" +namespace paddle { +namespace framework { +namespace ir { +class 
MemOptVarInfo; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { +class ComputationOpHandle; + ComputationOpHandle *GetUniquePendingComputationOpHandle( ShareTensorBufferOpHandle *share_tensor_op) { ComputationOpHandle *result_op = nullptr; diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h index a02c346485eca813f0d0f0b432b8b647e2fe4414..d14cbc31d827906804959cb25a2208d6d3c338cd 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h @@ -22,10 +22,22 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" +namespace paddle { +namespace framework { +class Scope; +namespace ir { +class MemOptVarInfo; +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { +class ComputationOpHandle; + class ShareTensorBufferOpHandle : public OpHandleBase { public: ShareTensorBufferOpHandle( diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h index b24b457d21d8b7a17d8e2ae615791090f3859a97..8bfea0f1ae8b8a63f5a220382f74b2a8e116f5ef 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h @@ -23,6 +23,17 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/nccl_helper.h" +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +namespace platform { +class NCCLCommunicator; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index bb38424d3ae2d74f6f0a48e11df95b60dbf432f3..a35ac0bd732fd28faea862a860f82b6fe46cac67 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -24,6 +24,14 @@ #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index fba0c1bf463ee0b9a434c350474af4be0c589e30..71e5dd28eded1c48ef49414df0a89fb072f0375c 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -15,6 +15,14 @@ #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace framework { +class LoDTensor; +class Variable; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/variable_visitor.h b/paddle/fluid/framework/details/variable_visitor.h index ca9a19bdcf1be7bf0e1d2b0de560a38f528a2d2c..a882d5120bc66849dbdd5fc179b1aa530436808e 100644 --- a/paddle/fluid/framework/details/variable_visitor.h +++ b/paddle/fluid/framework/details/variable_visitor.h @@ -17,6 +17,13 @@ 
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/variable.h" +namespace paddle { +namespace framework { +class Tensor; +class Variable; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc index aeec6161714028352da3628027864e8660dad774..fbaae5a21c274780e79c6d7ac629b1874b6ae53f 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -13,11 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/device_worker.h" -#include "xxhash.h" // NOLINT namespace paddle { namespace framework { +class LoDTensor; +class Scope; + void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; } void DeviceWorker::SetDataFeed(DataFeed* data_feed) { diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index efe6fa1b2daffcbbaa2b7945e7139ac83f689bcd..f6f3098613ba194bea90a36efc3153cf63d2db5b 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -39,6 +39,18 @@ limitations under the License. */ #include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/timer.h" +namespace paddle { +namespace framework { +class LoDTensor; +class ProgramDesc; +class Scope; +class Tensor; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -62,7 +74,9 @@ class PullDenseWorker { virtual void Initialize(const TrainerDesc& param); #ifdef PADDLE_WITH_CUDA void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); } +#endif +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) void AddPlace(const paddle::platform::Place place) { places_.push_back(place); } @@ -123,9 +137,9 @@ class PullDenseWorker { #ifdef PADDLE_WITH_CUDA std::vector copy_streams_; +#endif std::vector places_; std::vector thread_scopes_; -#endif }; // should incorporate different type of device @@ -149,6 +163,7 @@ class DeviceWorker { virtual void SetDataFeed(DataFeed* data_feed); virtual void SetWorkerNum(int num) {} virtual void CacheProgram(const ProgramDesc& main_program) {} + virtual void GetXpuOpIndex() {} virtual void SetNeedDumpField(bool need_dump_field) { need_dump_field_ = need_dump_field; } diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index 67be8db6e80329de4323e4cb8f904a24753f56bc..3b60cb65e34b4f4728ce7fae11b77364f284935f 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -20,6 +20,8 @@ limitations under the License. */ namespace paddle { namespace framework { +class DeviceWorker; + typedef std::shared_ptr (*Createdevice_workerFunction)(); typedef std::unordered_map device_workerMap; diff --git a/paddle/fluid/framework/device_worker_factory.h b/paddle/fluid/framework/device_worker_factory.h index 9d0613385e78c9f482840677c71f621e53ed85b5..6a31c3ea7a414fb88eb1b5abb57911617f800acd 100644 --- a/paddle/fluid/framework/device_worker_factory.h +++ b/paddle/fluid/framework/device_worker_factory.h @@ -16,11 +16,14 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/device_worker.h" namespace paddle { namespace framework { +class DeviceWorker; + class DeviceWorkerFactory { public: static std::string DeviceWorkerTypeList(); diff --git a/paddle/fluid/framework/device_worker_test.cc b/paddle/fluid/framework/device_worker_test.cc index b488e4cfe7a013375a55b19e854bde0f5024df21..461d329a371bfae700c238a009ec0821ed3a7297 100644 --- a/paddle/fluid/framework/device_worker_test.cc +++ b/paddle/fluid/framework/device_worker_test.cc @@ -13,9 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/device_worker.h" + #include + #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/trainer.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index a24cc4cb55c0491ed9be0298e4fbac4f2434b6d0..8b6985cb2577a1f6e6a0e6bc9fb6796892c14199 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -128,6 +128,7 @@ message DistributedStrategy { optional int32 conv_workspace_size_limit = 22 [ default = 4000 ]; optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ]; optional bool adaptive_localsgd = 24 [ default = false ]; + optional bool fp16_allreduce = 25 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 915589b3242b7d5675e630aca7310185fd109ec2..ac1e39ad2c1af6894d6bbaec563c487a6857f95a 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -11,10 +11,17 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/dlpack_tensor.h" #include - #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/dlpack_tensor.h" + +namespace paddle { +namespace platform { +struct bfloat16; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h index 5346ba62894308ea8910739d565f2e3f0fa8c8c0..e342523718b34b3e32e54d0ffd14128a43df34f7 100644 --- a/paddle/fluid/framework/dlpack_tensor.h +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -15,11 +15,14 @@ #pragma once #include + #include "paddle/fluid/framework/tensor.h" namespace paddle { namespace framework { +class Tensor; + class DLPackTensor { public: using LaneType = decltype(::DLTensor::dtype.lanes); // uint16_t diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 4dead063b477980ace5dabfead1447cb297bbdd2..4a1f151f69b2d46f55d2bcf8a701e2ecc156b3df 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -17,6 +17,12 @@ #include #include +namespace paddle { +namespace platform { +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 1c64bf1d3f7f31f42308395b5b054f62fd97b429..00f721701a4a55980c8df3079d88cbeeba116f49 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/device_worker.h" -#include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/platform/cpu_helper.h" -#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { +class LoDTensor; +class Variable; +} // namespace framework +} // namespace paddle #if defined _WIN32 || defined __APPLE__ #else diff --git a/paddle/fluid/framework/downpour_worker_opt.cc b/paddle/fluid/framework/downpour_worker_opt.cc index b40a00ef9cb8cf3f51fdca4d71a905ac912db51f..afe6ddfa3d9a6361cbbb63047114c01bbe819b8c 100644 --- a/paddle/fluid/framework/downpour_worker_opt.cc +++ b/paddle/fluid/framework/downpour_worker_opt.cc @@ -12,18 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/device_worker.h" -#include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/platform/cpu_helper.h" -#include "paddle/fluid/platform/lodtensor_printer.h" namespace paddle { namespace framework { +class OpDesc; +class OperatorBase; +class ProgramDesc; + bool HasDependentOutput(const OpDesc& op_desc, const std::unordered_set& dependent_vars) { for (auto& var : op_desc.Outputs()) { diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index 0e3edfb95cb9b37543ce84ba9a22227d2761734a..a6abda8a83bc8484ccc9017d32bc04b7df32e90c 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -14,6 +14,8 @@ limitations under the License. 
*/ #pragma once +#include + #include "paddle/fluid/framework/tensor.h" #include "unsupported/Eigen/CXX11/Tensor" diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index fa6a65d5892f0098a95d1e30b11501bc1043b14f..7593b60abfffcd9a0a3e9f743930660327c1409e 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/garbage_collector.h" @@ -32,6 +33,11 @@ limitations under the License. */ namespace paddle { namespace framework { +class Dataset; +class ProgramDesc; +class Scope; +class TrainerBase; + struct ExecutorPrepareContext { ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 706248229bc27e553fbc136116ab616f371eed5e..c80eedb1b86f771d1c6bf97442733e90bae374e0 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -13,16 +13,19 @@ // limitations under the License. #include "paddle/fluid/framework/executor_gc_helper.h" + #include #include -#include #include #include -#include + #include "glog/logging.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/executor_gc_helper.h b/paddle/fluid/framework/executor_gc_helper.h index a4c71c5304e05e3d0dca6ca08d955f39b779556b..e44edc5aa1c810f859942a62763e0c9179885987 100644 --- a/paddle/fluid/framework/executor_gc_helper.h +++ b/paddle/fluid/framework/executor_gc_helper.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" @@ -26,6 +27,10 @@ namespace paddle { namespace framework { // Result map: op -> variable names that can be deleted after op runs +class GarbageCollector; +class OperatorBase; +class Scope; + std::unordered_map> GetUnusedVars(const BlockDesc &block, const std::vector> &ops, diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index fd857f7735c1db2b5b3678517e5301d4ab8700ef..3bd85b2b24b97b22132c9dae792361dc7b9ad734 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -13,16 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/feed_fetch_method.h" + #include -#include + #include "glog/logging.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { +class LoDTensor; +class Variable; + void SetFeedVariable(Scope* scope, const LoDTensor& input, const std::string& var_name, size_t index) { // If var_name Variable is not found in GlobalScope, a new variable will diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index 65c8b255ffb2fbe8a054dd871bccd665d284968d..a52ef517c8b7342522aeb7d699a19d22619d3d0b 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -15,12 +15,16 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/scope.h" namespace paddle { namespace framework { +class LoDTensor; +class Scope; + void SetFeedVariable(Scope* scope, const LoDTensor& input, const std::string& var_name, size_t index); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 34fff042770c5f50a280408d8f7f925488b3879c..693073d1fc73a65bd17e34da864f9d8df019043c 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -29,12 +29,6 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include #include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/data_feed.h" -#include "paddle/fluid/framework/io/fs.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/timer.h" namespace paddle { namespace framework { @@ -751,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync( push_sparse_status->push_back(std::move(status)); } } +#endif + +#ifdef PADDLE_WITH_XPU +void FleetWrapper::PushDenseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* push_sparse_status, + float scale_datanorm, int batch_size, + const paddle::platform::Place& place) { +#ifdef PADDLE_WITH_PSLIB + std::vector regions; + for (auto& t : var_names) { + Variable* var = scope.FindVar(t); + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g_data = tensor->data(); + + Variable* pin_var = scope.FindVar(t + "pin"); + LoDTensor* pin_tensor = pin_var->GetMutable(); + float* pin_g = + pin_tensor->mutable_data(tensor->dims(), platform::CPUPlace()); + memory::Copy(platform::CPUPlace(), pin_g, + BOOST_GET_CONST(platform::XPUPlace, place), g_data, + sizeof(float) * count); + + float* g = pin_g; + if (scale_datanorm >= 0) { + if (t.find(".batch_size@GRAD") != std::string::npos || + t.find(".batch_sum@GRAD") != std::string::npos) { + Eigen::Map mat(g, 1, count); + float scale = 1.0 / batch_size; + mat *= scale; + } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) { + VLOG(3) << "epsilon: " << scale_datanorm; + for (int i = 0; i < count; ++i) { + g[i] = (g[i] - batch_size * scale_datanorm) / batch_size + + batch_size * scale_datanorm; + } + } + } + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(), + regions.size(), table_id); + if (push_sparse_status) { + 
push_sparse_status->push_back(std::move(status)); + } +#endif +} #endif void FleetWrapper::PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index cc13a50160a94c63345bcbd5633f2d3f8555ae0c..ae86835f38df77a3a7661433501cdd2440553d17 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -35,6 +35,12 @@ limitations under the License. */ #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { @@ -154,6 +160,14 @@ class FleetWrapper { float scale_datanorm, int batch_size, const paddle::platform::Place& place, cudaStream_t stream, cudaEvent_t event); +#endif +#ifdef PADDLE_WITH_XPU + void PushDenseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* push_sparse_status, + float scale_datanorm, int batch_size, + const paddle::platform::Place& place); #endif void PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc index f195dde40843c8c4ee5168d11ad0b8eac8199f4e..f4b2d2d7d1881d2ad5e9ca50156923e27ce55316 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.cc +++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc @@ -10,10 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/fleet/gloo_wrapper.h" -#include // NOLINT -#include #include "paddle/fluid/framework/io/fs.h" -#include "paddle/fluid/platform/errors.h" #include "paddle/fluid/string/string_helper.h" namespace gloo { diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index b70d5e5fc1ae6c90dac4ebf1d86353e38a79492d..8e232560ab6876995a735b6901a5459265f9cb05 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -27,15 +27,6 @@ See the License for the specific language governing permissions and limitations under the License. 
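The new PADDLE_WITH_XPU overload of FleetWrapper::PushDenseVarsAsync above cannot hand device pointers straight to PSLib, so each gradient is staged through a host-side tensor first. The core of that staging step for a single variable looks like this (a sketch; grad and pinned stand for the device tensor and its CPU staging tensor):

// Allocate host storage shaped like the device gradient.
float* host_ptr =
    pinned->mutable_data<float>(grad->dims(), platform::CPUPlace());

// Device-to-host copy; the XPU overload of memory::Copy takes no stream.
memory::Copy(platform::CPUPlace(), host_ptr,
             BOOST_GET_CONST(platform::XPUPlace, place), grad->data<float>(),
             sizeof(float) * grad->numel());

// From here the CPU path is reused: wrap the host buffer in a Region and
// push it to the dense table.
paddle::ps::Region region(host_ptr, grad->numel());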
*/ #include "paddle/fluid/framework/fleet/heter_wrapper.h" -#include -#include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/data_feed.h" -#include "paddle/fluid/framework/device_worker.h" -#include "paddle/fluid/framework/io/fs.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/timer.h" #ifdef PADDLE_WITH_PSLIB namespace paddle { @@ -122,29 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope, if (platform::is_cpu_place(tensor->place())) { memcpy(data_ptr, tensor->data(), tensor->numel() * SizeOfType(tensor->type())); - } + } else { #ifdef PADDLE_WITH_CUDA - else { memory::Copy(platform::CPUPlace(), data_ptr, BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), tensor->numel() * SizeOfType(tensor->type()), nullptr); - } #endif +#ifdef PADDLE_WITH_XPU + memory::Copy(platform::CPUPlace(), data_ptr, + BOOST_GET_CONST(platform::XPUPlace, tensor->place()), + tensor->data(), + tensor->numel() * SizeOfType(tensor->type())); +#endif + } } -// void HeterWrapper::DeSerializeToTensor(Scope* scope, -// const HeterRequest* request) { #ifdef PADDLE_WITH_CUDA void HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, cudaStream_t stream) { + // const VariableMessage& req_var = request->vars(); + auto* var = scope->FindVar(req_var.varname()); + auto* tensor = var->GetMutable(); + + std::vector vec_dim; + for (auto& x : req_var.dims()) { + vec_dim.push_back(x); + } + tensor->Resize(make_ddim(vec_dim)); + + LoD lod; + for (int i = 0; i < req_var.lod_level(); ++i) { + framework::Vector v; + for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) { + v.push_back(req_var.lod(i).lod_data(j)); + } + lod.push_back(v); + } + tensor->set_lod(lod); + + void* tensor_data = + tensor->mutable_data(place, ToVarType(req_var.data_type())); + +#ifdef PADDLE_WITH_CUDA + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, + platform::CPUPlace(), req_var.data().data(), + tensor->numel() * SizeOfType(tensor->type()), stream); #else + memcpy(tensor_data, req_var.data().data(), + tensor->numel() * SizeOfType(tensor->type())); +#endif +} +#endif + +// void HeterWrapper::DeSerializeToTensor(Scope* scope, +// const HeterRequest* request) { void HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place) { -#endif // const VariableMessage& req_var = request->vars(); auto* var = scope->FindVar(req_var.varname()); auto* tensor = var->GetMutable(); @@ -168,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, void* tensor_data = tensor->mutable_data(place, ToVarType(req_var.data_type())); -#ifdef PADDLE_WITH_CUDA - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, +#ifdef PADDLE_WITH_XPU + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data, platform::CPUPlace(), req_var.data().data(), - tensor->numel() * SizeOfType(tensor->type()), stream); + tensor->numel() * SizeOfType(tensor->type())); #else memcpy(tensor_data, req_var.data().data(), tensor->numel() * SizeOfType(tensor->type())); @@ -192,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType( case VariableMessage::BOOL: return framework::proto::VarType::BOOL; // NOLINT default: - VLOG(0) << "Not support type " << type; + PADDLE_THROW(platform::errors::InvalidArgument( + "ToVarType:Unsupported type %d", type)); } } @@ -239,7 +268,7 @@ void 
HeterWrapper::CallRemoteXpu(std::shared_ptr task, request.set_cur_batch(task->cur_batch_); OnHeterRpcDone* done = new OnHeterRpcDone([this, task, worker](void* done) { - auto* closure = (OnHeterRpcDone*)done; + auto* closure = reinterpret_cast(done); if (closure->cntl.Failed()) { VLOG(0) << "call xpu fail: " << closure->cntl.ErrorText(); } else { diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc index 33a91388fd8cc97d181df46ab826d384860d38f5..ed92e2e9aadb36b026334c1074bdd60d11beeb1b 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.cc +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -13,9 +13,6 @@ // limitations under the License. #include "paddle/fluid/framework/fleet/nccl_wrapper.h" -#include -#include "paddle/fluid/framework/data_feed.h" -#include "paddle/fluid/framework/scope.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h index a55921f1ac2a1204965433cac12ba2ca2e19367e..3725a225dbecfec0b6c6b934b259d895eb09c9cb 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.h +++ b/paddle/fluid/framework/fleet/nccl_wrapper.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable_helper.h" @@ -29,6 +30,12 @@ limitations under the License. */ #endif #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 4f7739652822b9047b1798b6bd66261effbe2f49..884d230816b2ecae61c332f024fa8a4c9b0f2070 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -19,9 +19,16 @@ #include #include // NOLINT #include + #include "gflags/gflags.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index a279c2e4e1458293b6579b7b7cb2111e440e5d5e..862e63c4c6af5a503a0fe04a8966d543ef23e55e 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include - #include #include #include // temp for debug diff --git a/paddle/fluid/framework/hetercpu_worker.cc b/paddle/fluid/framework/hetercpu_worker.cc index 83838f4df67d0bcbd9fb8ec8fb6762641287d2c4..747fd434ae7833b999918a0a52928df6b28891e4 100644 --- a/paddle/fluid/framework/hetercpu_worker.cc +++ b/paddle/fluid/framework/hetercpu_worker.cc @@ -12,13 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
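For reference, the SerializeToReq hunk in heter_wrapper.cc above folds the device copies into a single else-branch, one per build flavor. Condensed control flow (src and num_bytes stand for the tensor's data pointer and tensor->numel() * SizeOfType(tensor->type())):

if (platform::is_cpu_place(tensor->place())) {
  // Host tensor: plain memcpy into the RPC buffer.
  memcpy(data_ptr, src, num_bytes);
} else {
#ifdef PADDLE_WITH_CUDA
  // CUDA build: synchronous device-to-host copy (stream argument is nullptr).
  memory::Copy(platform::CPUPlace(), data_ptr,
               BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), src,
               num_bytes, nullptr);
#endif
#ifdef PADDLE_WITH_XPU
  // XPU build: the XPU overload of memory::Copy has no stream parameter.
  memory::Copy(platform::CPUPlace(), data_ptr,
               BOOST_GET_CONST(platform::XPUPlace, tensor->place()), src,
               num_bytes);
#endif
}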
*/ -#include "paddle/fluid/framework/device_worker.h" -#include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" -#include "paddle/fluid/platform/cpu_helper.h" -#include "paddle/fluid/string/string_helper.h" - #ifdef PADDLE_WITH_PSLIB #if defined _WIN32 || defined __APPLE__ diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index 5ca1aa66319228fc95f63294b15c981dd3c8ba30..6bbbaacdde3b30a8956794b650e2ff7b1f503a59 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -22,9 +22,11 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/trainer.h" -#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ + (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" - +#endif namespace paddle { namespace framework { @@ -44,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, int place_num = trainer_desc.worker_places_size(); for (int i = 0; i < place_num; ++i) { int num = trainer_desc.worker_places(i); +#ifdef PADDLE_WITH_CUDA platform::CUDAPlace place = platform::CUDAPlace(num); platform::CUDADeviceGuard guard(place.device); cudaStream_t stream; @@ -54,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); events_.push_back(event); +#endif +#ifdef PADDLE_WITH_XPU + platform::XPUPlace place = platform::XPUPlace(num); + places_.push_back(place); +#endif } // thread_num_ = trainer_desc.thread_num(); // SetDataset(dataset); @@ -105,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { auto place = places_[num]; Scope* scope = place_scopes_[num]; +#ifdef PADDLE_WITH_CUDA auto stream = copy_streams_[num]; auto event = events_[num]; - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); +#endif + +#ifdef PADDLE_WITH_XPU + xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); +#endif + auto& block = program.Block(0); for (auto& var : block.AllVars()) { if (var->Persistable()) { @@ -126,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { HeterMemCpy(thread_tensor, root_tensor, place, stream); \ } \ } while (0) + +#define HeterMemcpyXpuFunc(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + HeterMemCpy(thread_tensor, root_tensor, place); \ + } \ + } while (0) +#ifdef PADDLE_WITH_CUDA _ForEachDataType_(HeterMemcpyFunc); +#endif +#ifdef PADDLE_WITH_XPU + _ForEachDataType_(HeterMemcpyXpuFunc); +#endif } } +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); +#endif } +#ifdef PADDLE_WITH_CUDA template void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor, @@ -151,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, root_ptr, sizeof(T) * root_tensor->numel(), stream); } } +#endif + +#ifdef PADDLE_WITH_XPU +template +void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, + 
LoDTensor* root_tensor, + const paddle::platform::Place& thread_place) { + T* thread_ptr = + thread_tensor->mutable_data(root_tensor->dims(), thread_place); + T* root_ptr = root_tensor->data(); + if (platform::is_cpu_place(root_tensor->place())) { + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, + platform::CPUPlace(), root_ptr, + sizeof(T) * root_tensor->numel()); + } else { + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, + BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()), + root_ptr, sizeof(T) * root_tensor->numel()); + } +} +#endif void HeterXpuTrainer::DumpWork(int tid) {} @@ -181,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { CreateThreadParam(main_program, i); pull_dense_worker_->AddThreadScope(scope); pull_dense_worker_->AddPlace(places_[i]); +#ifdef PADDLE_WITH_CUDA pull_dense_worker_->AddStream(copy_streams_[i]); +#endif } - pull_dense_worker_->Start(); +#ifdef PADDLE_WITH_CUDA for (auto& stream : copy_streams_) { cudaStreamSynchronize(stream); } +#endif op_names_.clear(); for (auto& op_desc : block.AllOps()) { std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); @@ -230,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { OperatorBase* local_op_ptr = local_op.release(); (context->ops_).push_back(local_op_ptr); } +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); +#endif object_pool_.Push(context); } } @@ -277,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, } \ } while (0) _ForEachDataType_(MergeCallback); - if (platform::is_gpu_place(thread_tensor->place())) { + if (!platform::is_cpu_place(thread_tensor->place())) { +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device; platform::CUDADeviceGuard guard(dev_id); cudaMemset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); +#endif +#ifdef PADDLE_WITH_XPU + auto place = thread_tensor->place(); + xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::DeviceContext* dev_ctx = pool.Get(place); + const platform::XPUDeviceContext* xpu_ctx = + reinterpret_cast(dev_ctx); + xpu::memset(xpu_ctx->x_context(), thread_tensor->data(), 0, + thread_tensor->numel() * SizeOfType(thread_tensor->type())); +#endif } else { memset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); @@ -291,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, auto* merge_var = response->add_vars(); heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_, merge_var); - if (platform::is_gpu_place(root_tensor->place())) { + if (!platform::is_cpu_place(root_tensor->place())) { +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device; platform::CUDADeviceGuard guard(dev_id); cudaMemset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); +#endif +#ifdef PADDLE_WITH_XPU + auto place = root_tensor->place(); + xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::DeviceContext* dev_ctx = pool.Get(place); + 
const platform::XPUDeviceContext* xpu_ctx = + reinterpret_cast(dev_ctx); + xpu::memset(xpu_ctx->x_context(), root_tensor->data(), 0, + root_tensor->numel() * SizeOfType(root_tensor->type())); +#endif } else { memset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); @@ -334,7 +415,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, std::shared_ptr context = object_pool_.Get(); if (!context->scope_) { - int num = rand() % places_.size(); + int num = rand_r() % places_.size(); context->place_num_ = num; auto place = places_[num]; context->scope_ = &(place_scopes_[num]->NewScope()); @@ -356,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, OperatorBase* local_op_ptr = local_op.release(); (context->ops_).push_back(local_op_ptr); } - +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); +#endif } context->Reset(); @@ -369,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, auto deserial_timer = std::make_shared("xpu_service_deserial"); for (int i = 0; i < request->vars_size(); ++i) { +#ifdef PADDLE_WITH_CUDA heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place, copy_streams_[context->place_num_]); +#endif +#ifdef PADDLE_WITH_XPU + heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place); +#endif } +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventRecord(context->event_, copy_streams_[context->place_num_])); while (cudaEventQuery(context->event_) != cudaSuccess) { VLOG(3) << "wait for kernel"; bthread_yield(); } +#endif } { @@ -388,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, op->Run(*(context->scope_), place); } } +#ifdef PADDLE_WITH_CUDA auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); PADDLE_ENFORCE_CUDA_SUCCESS( @@ -401,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, bthread_yield(); } } +#endif +#ifdef PADDLE_WITH_XPU + xpu_wait(); +#endif for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) { const std::string& varname = trainer_desc_.xpu_send_list(i); @@ -417,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, ++i) { uint64_t tid = static_cast(param_.program_config(0).push_dense_table_id(i)); +#ifdef PADDLE_WITH_CUDA fleet_ptr_->PushDenseVarsAsync( *(context->scope_), tid, dense_grad_names_[tid], &(context->push_dense_status_), scale_datanorm_, request->cur_batch(), places_[context->place_num_], copy_streams_[context->place_num_], context->event_); +#endif +#ifdef PADDLE_WITH_XPU + fleet_ptr_->PushDenseVarsAsync( + *(context->scope_), tid, dense_grad_names_[tid], + &(context->push_dense_status_), scale_datanorm_, request->cur_batch(), + places_[context->place_num_]); +#endif } for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); ++i) { @@ -463,7 +565,6 @@ void HeterXpuTrainer::Finalize() { pull_dense_worker_->Stop(); root_scope_->DropKids(); } - } // namespace framework } // namespace paddle #endif diff --git a/paddle/fluid/framework/inlined_vector.h b/paddle/fluid/framework/inlined_vector.h index 2a7f26b9f9661ecc79112307bac2780e357a55b4..f8e937fa107bc6f4d7c10ec8fe59283f85796879 100644 --- a/paddle/fluid/framework/inlined_vector.h +++ b/paddle/fluid/framework/inlined_vector.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include 
"paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/inlined_vector_test.cc b/paddle/fluid/framework/inlined_vector_test.cc index 003c0d7bbeac2b3d8ed62766fee09912c7a07bb2..581e7d8934ddec4b64d706d33448300f81ddbc91 100644 --- a/paddle/fluid/framework/inlined_vector_test.cc +++ b/paddle/fluid/framework/inlined_vector_test.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/inlined_vector.h" + #include #include -#include -#include + #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/framework/io/crypto/cipher_utils.cc b/paddle/fluid/framework/io/crypto/cipher_utils.cc index e0c653e001628b17adcb09f9d70cfb183e09860e..ee9f06b2f3eb10d22a2f50c80827f0f80259bb35 100644 --- a/paddle/fluid/framework/io/crypto/cipher_utils.cc +++ b/paddle/fluid/framework/io/crypto/cipher_utils.cc @@ -15,8 +15,6 @@ #include "paddle/fluid/framework/io/crypto/cipher_utils.h" #include - -#include #include #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/io/crypto/cipher_utils.h b/paddle/fluid/framework/io/crypto/cipher_utils.h index 936f62f6ba65cfe26be56ed2dd828f1e42e8d1b2..52db03f530c34e58b963f48aa93c757fac534185 100644 --- a/paddle/fluid/framework/io/crypto/cipher_utils.h +++ b/paddle/fluid/framework/io/crypto/cipher_utils.h @@ -17,6 +17,7 @@ #include #include #include + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/io/fs.h b/paddle/fluid/framework/io/fs.h index c88636e267422c27696341df145346e33018cdaf..bb6d720ca584c16aa5898cb72483e9ed83e77fcc 100644 --- a/paddle/fluid/framework/io/fs.h +++ b/paddle/fluid/framework/io/fs.h @@ -18,6 +18,7 @@ #include #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/io/shell.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/io/shell.h b/paddle/fluid/framework/io/shell.h index dc486275d6f58eaa7a360b8f17830acd664b11c7..7db5cd7661cd7670f90fc0d788eda6f98b46cf37 100644 --- a/paddle/fluid/framework/io/shell.h +++ b/paddle/fluid/framework/io/shell.h @@ -32,6 +32,7 @@ #include #include #include + #include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h index 47ed9f0393fb222e612ed3bce1afbc879edb410d..48e3989a5314c613209718a313b076f4ce208ebc 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class AttentionLSTMFusePass : public FusePassBase { protected: void ApplyImpl(ir::Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index fd8b55a6b7deb9bf4685b27f8849a49ab77f64e9..9c984a23e377d749947a61793838956079a3678b 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -13,19 +13,28 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" + #include -#include -#include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/platform/enforce.h" +namespace paddle { +namespace framework { +class LoDTensor; +class Scope; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { +class Node; + #define GET_CONV_BN_NODES(pattern_name) \ /* OPERATORS */ \ GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h index d607020a47b8c589775ac763f04e64272dfec4e0..916384ec44704537f472c8b99bc5766489bd1ced 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -26,6 +27,8 @@ namespace ir { /* * Fuse the Conv and ConvAffineChannel. */ +class Graph; + class ConvAffineChannelFusePass : public FusePassBase { public: virtual ~ConvAffineChannelFusePass() {} diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index fb787e08814429781bf64efda2f1eb915f185f63..a915015bf55bd8a93fcc8311abc871e11cb9402d 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -13,15 +13,22 @@ // limitations under the License. #include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h" -#include -#include + #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/platform/enforce.h" +namespace paddle { +namespace framework { +class LoDTensor; +class Scope; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h index 57a9f69ca15af2759874a1e2a0b58399de652693..342cd8dad5fb959a11df6c50fda4f22bb73ec5ba 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -26,6 +27,8 @@ namespace ir { /* * Fuse the Conv and BatchNorm to a ConvBNMKLDNNOp. 
*/ +class Graph; + class ConvBNFusePass : public FusePassBase { public: virtual ~ConvBNFusePass() {} diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h index ea9e465d8d765a298215db29c77aa58e727fd15e..e68f57d4ae998203c6f34aee7cca11d69a5e6d3f 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class ConvElementwiseAdd2ActFusePass : public FusePassBase { public: virtual ~ConvElementwiseAdd2ActFusePass() {} diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index c5fa47ec55fe9a15caca493a4b0c72c22f2cf5c7..93e6e13ff7092c80958d5defb11e2e456298c7b7 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h" #include + #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h index 8b34c3551d8f9b54f01e52cc0fc896901cd7df99..933092c7db7d38d722af9392e71cd0c1797f0eee 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class ConvElementwiseAddActFusePass : public FusePassBase { public: virtual ~ConvElementwiseAddActFusePass() {} diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc index 38c0b773ddeddddea68ecefd6c8525449c52d7a6..e4396f227f7f5280cfd3057aebfed8d02480d154 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h" #include + #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h index 66a562cdd1948980a6792a53713cac947d72e7d6..7198a7488e052b5bdbe52d662b903d9f90c51da0 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class ConvElementwiseAddFusePass : public FusePassBase { public: virtual ~ConvElementwiseAddFusePass() {} diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass.h b/paddle/fluid/framework/ir/cudnn_placement_pass.h index 99dd3a175d37adabe4fab3888e94fe080631086b..8d84c2bf707956c4a00454a6dc66efcb42bec816 100644 --- a/paddle/fluid/framework/ir/cudnn_placement_pass.h +++ b/paddle/fluid/framework/ir/cudnn_placement_pass.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/ir/placement_pass_base.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc index 4dfbd5e00c15cf713cc42a542bbb1a019b6490d9..886b080c6624c2577327db9753839f6a8c9740ad 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc @@ -12,10 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h" -#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h index 938ada6453edf4e3ce324aa787e9d23905869d93..fea0498fdecd26a723692338bdcd51b5335be501 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -21,6 +22,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class DeleteQuantDequantOpPass : public FusePassBase { public: virtual ~DeleteQuantDequantOpPass() {} diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 3f88a460d140f6d7389194a29e37128f5ba5b458..51861b402d58aa1224fbbfbc1476ed848716d5f7 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -13,10 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h" -#include #include #include #include + #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h index 644eb1cf89221c4e6e22e3d767b4b802702d7b88..25049d7468b152e72ad5f32fb38d9204f7219dff 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h @@ -17,10 +17,19 @@ #include #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h index 65cb4439727b466506af35df1bed609b18c06ee0..a9cde13758b07066193992637e7048a67540c2fd 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h @@ -27,6 +27,8 @@ namespace ir { // Fusing of Embedding , FC and LSTM op // Just FC without bias +class Graph; + class EmbeddingFCLSTMFusePass : public FusePassBase { public: virtual ~EmbeddingFCLSTMFusePass() {} diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc index 5c18a0d6c7ffe43a882f8f190f3449d6778c1bd0..bedb968964123df59b9cd68726c5718381a349b4 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h" + #include #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h index ac4d0b39ee267c724636954263aa2dce9d9ec47f..12e4c44b84e87bb710774ebba0ba2853d8b37f5e 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class FCElementwiseLayerNormFusePass : public FusePassBase { public: virtual ~FCElementwiseLayerNormFusePass() {} diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index d60510a4074997a028cd914ca7a0e76335801c80..0248aeedd0afeda155c19fb9ff59baf1d9197ea6 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -13,10 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/fc_fuse_pass.h" -#include + #include -#include #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h index ef6636d109a69d32aa50044b9686254e5d7769a5..f564bbb151854fe325975285b18d25b517336014 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -26,6 +26,8 @@ namespace ir { /* * Fuse the MUL and ELEMENTWISE_ADD to a FCOp. */ +class Graph; + class FCFusePass : public FusePassBase { public: virtual ~FCFusePass() {} diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index f5fea90ac2fcee8e9c48ca21203b3b60cd7f7166..c4515bbc45538ca211382aa119bbec5721c56c5a 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -22,6 +23,8 @@ namespace paddle { namespace framework { namespace ir { +class Node; + static int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, bool with_fc_bias) { GraphPatternDetector gpd; diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h index e11cdac7ea95219444c35bb8deef630fe29d3734..73f00504d34d5f1cfddbc3826f7a84e6925fc9f3 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -25,6 +26,8 @@ namespace ir { // The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op. +class Graph; + class FCGRUFusePass : public FusePassBase { public: virtual ~FCGRUFusePass() {} diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index a3c57e14e1aedbed1e4cf462d4883cd83bf2fa10..2b451da7bfa8b089c0f891ce42fbc293b19ac4b1 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -22,6 +23,8 @@ namespace paddle { namespace framework { namespace ir { +class Node; + int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, bool with_fc_bias) { GraphPatternDetector gpd; diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h index 5dea7c91a860f0b9622610f12f195eafb9849555..d37f53b15f06b72e67c234baec3a314f0f462735 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h @@ -27,6 +27,8 @@ namespace ir { // The MulLstmFusePass and MulLstmFusePass will fuse to the same FusionLstm op. 
// Just FC without bias +class Graph; + class FCLstmFusePass : public FusePassBase { public: virtual ~FCLstmFusePass() {} diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index 54c05046a2c2f2f56c20a32b8ca32578abe7af31..db3c711201dc76766b4ec29c9ce1590567971939 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -15,12 +15,17 @@ #include "paddle/fluid/framework/ir/fuse_bn_act_pass.h" #include #include -#include -#include -#include #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" #endif diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.h b/paddle/fluid/framework/ir/fuse_bn_act_pass.h index 427ff03a803c991ee908dcd4e17379f19fd873af..7e5f046ecaa01cb1398b15cc572fb5bd00d0a5fe 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.h +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -30,6 +31,9 @@ namespace ir { /* * Fuse the BatchNorm and activation. */ +class Graph; +class Node; + class FuseBatchNormActPass : public FusePassBase { public: virtual ~FuseBatchNormActPass() {} diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h index dc73f1fda03e130c6876819d91897b497b8b321e..d9356b7bd7221347eec3a9bc5f3ebb7ffd6ecde9 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -29,6 +30,9 @@ namespace ir { /* * Fuse the ElewiseAdd and activation */ +class Graph; +class Node; + class FuseElewiseAddActPass : public FusePassBase { public: virtual ~FuseElewiseAddActPass() {} diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc index c284c1f4587cd6dd5c8eacc43968f45e4fbef699..0094b674c2a176605f1fb13c2a759d75833f20a3 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc @@ -11,19 +11,22 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#include +#include #include -#include -#include -#include +#include "glog/logging.h" #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" -#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { namespace ir { +class Node; + class FuseAdamOpPass : public FuseOptimizerOpPass { private: const std::string GetOpType() const { return "adam"; } diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc index 43ec8bff5edc10cbfc48c06a2e35a5a46ed7043c..f87d31cbc409c7c4db634e994738d106d2a8146e 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc @@ -12,18 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include -#include -#include +#include "glog/logging.h" #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" -#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { namespace ir { +class Node; + class FuseMomentumOpPass : public FuseOptimizerOpPass { private: virtual const std::string GetOpType() const { return "momentum"; } diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h index 0b5bf8a3a4ba5d2d819529718cf018530d67d017..5b7e1b7d384c8211e22ea09ffd5e730bb3d70d6a 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h @@ -19,14 +19,25 @@ #include #include #include + #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph.h" +namespace paddle { +namespace framework { +class BlockDesc; +class VarDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { +class Graph; +class Node; + constexpr char kGrad[] = "Grad"; constexpr char kParam[] = "Param"; constexpr char kLearningRate[] = "LearningRate"; diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc index 70d4d2b865230078889115b809d8617b4415cc99..720d252c9a6b4bac733fa1d836e24ec168ca10ad 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc @@ -11,17 +11,21 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include -#include -#include +#include "glog/logging.h" #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" -#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/platform/enforce.h" + namespace paddle { namespace framework { namespace ir { +class Node; + class FuseSgdOpPass : public FuseOptimizerOpPass { private: virtual const std::string GetOpType() const { return "sgd"; } diff --git a/paddle/fluid/framework/ir/fuse_pass_base.cc b/paddle/fluid/framework/ir/fuse_pass_base.cc index e6fb1302e275fa2635542baf824c5e3333c2f5c8..f3db4f02b1c5fd4f0a3ccab23fbeec767188e847 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.cc +++ b/paddle/fluid/framework/ir/fuse_pass_base.cc @@ -15,10 +15,18 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { +class Graph; + void FusePassBase::Init(const std::string& repr, Graph* graph) const { repr_ = repr; graph_ = graph; diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h index 3a1022bbcbd671391fb034bdff7c3cf97952f84d..ce7635bb35ce6108b4a5a356c8fb99269dbf2890 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.h +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -15,14 +15,24 @@ #pragma once #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/scope.h" +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { +class Graph; +class Node; + static const char kParamScopeAttr[] = "__param_scope__"; static const char kFuseStatisAttr[] = "__fuse_statis__"; // When we use trt or other third_party lib, the parameters are managed by diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h index d37c153dd2a05ecfc8f0626626bbc3ed2f85968b..0b1dfaa6928b8d1bab7e08cc668e00ac30bbffff 100644 --- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h @@ -15,6 +15,7 @@ #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -27,6 +28,8 @@ namespace ir { /* * Fuse the relu and depthwise conv */ +class Graph; + class FuseReluDepthwiseConvPass : public FusePassBase { public: virtual ~FuseReluDepthwiseConvPass() {} diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.h b/paddle/fluid/framework/ir/fusion_group/code_generator.h index 21773f239b9f6e5208aea45f481bf6f92745033f..15d21cf6829d40bfcdf143f1285ebefe43037056 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator.h +++ b/paddle/fluid/framework/ir/fusion_group/code_generator.h @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h" #include "paddle/fluid/framework/ir/fusion_group/subgraph.h" @@ -27,6 +28,8 @@ namespace framework { namespace ir { namespace fusion_group { +class SubGraph; + class CodeGenerator { public: CodeGenerator(); diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index ebc89b14c265d3491f0f9bc64a36f52c6c9f2a18..2a7a0748cf0e6569b9be61bc84c1c7e1b0a29820 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -12,18 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/fusion_group/code_generator.h" #include #include #include #include + +#include "paddle/fluid/framework/ir/fusion_group/code_generator.h" #include "paddle/fluid/framework/ir/fusion_group/operation.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/device_code.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/init.h" + +namespace paddle { +namespace framework { +class LoDTensor; +} // namespace framework +} // namespace paddle #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h index 0861c2f7e96349d2d0d36d21e6ab19d98c2910e3..96b38f650131dae53450330b5f3368d42a7e52fa 100644 --- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h +++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h @@ -15,9 +15,18 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h index 3438783c180d7c45c506a9244e6a612db0b67138..5ca785846a522aaa2cf41ed0f24edb1a8ae3fa45 100644 --- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h +++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fusion_group/subgraph.h" @@ -23,6 +24,11 @@ namespace paddle { namespace framework { namespace ir { +class Graph; +namespace fusion_group { +class SubGraph; +} // namespace fusion_group + class FusionGroupPass : public FusePassBase { protected: void ApplyImpl(Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/fusion_group/operation.h b/paddle/fluid/framework/ir/fusion_group/operation.h index 74abbdaad0bdfbd6115f47cc1fc5f53608b7f744..d99fe737504e52e0867f531c7cfa03d0dfbf07a4 100644 --- a/paddle/fluid/framework/ir/fusion_group/operation.h +++ b/paddle/fluid/framework/ir/fusion_group/operation.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 137ab7a56d7c409326b4ed6d0f4b2b881f50eaea..593ac214e56f9722538c89ff80ec9d6b98fcd8ae 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -25,6 +25,13 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/variant.h" +namespace paddle { +namespace framework { +class OpDesc; +class VarDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 074ad320fb122bd49fe144f803b47ec9768b3504..0c43febca70b7d44be6c399263d11a5900f9b109 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -28,6 +28,8 @@ namespace framework { namespace ir { // Compare nodes via node id. +class Graph; + struct NodeComp { bool operator()(ir::Node *const &node1, ir::Node *const &node2) const { return node1->id() < node2->id(); diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index d8973d5aeda1a2e0650a506b4c916b4346f01e2d..0a2dcfed000c9f4bbddc280912dc8966044c265d 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" -#include #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 053c1fe832b0088d2abdd3f8eb40a0042e5e2dfe..7116b8a2a6f353fc40fcba86e8cba3b2f37855bc 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -27,11 +27,21 @@ #include #include #include + #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/inference/analysis/dot.h" +namespace paddle { +namespace framework { +namespace ir { +class Graph; +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc index 6c466fb21fb46e09961dc874e9e39655f83d17c6..5ac5a5d983992746c8bbbb318c41d04a0cf76fae 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - #include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + namespace paddle { namespace framework { namespace ir { +class Node; + void BuildGraph(Graph* g) { ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h index 52c8f4e0fcafcd42647b323a20fee7c7cf167b3a..6b17c0076f6fd7c8b7e64f96cda4ee9835236913 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.h +++ b/paddle/fluid/framework/ir/graph_to_program_pass.h @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + const char kGraphToProgramVarsToRemove[] = "__graph_to_program_vars_to_remove__"; const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__"; diff --git a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc index 5ee6b8a5f1e4e7415adfac6b51e9d3ae8e3062a9..80d7839d700b6de370d1ad8eb0d3ec7884571d4b 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/graph_to_program_pass.h" - #include #include #include @@ -25,6 +24,8 @@ namespace paddle { namespace framework { namespace ir { +class Node; + void BuildNoCircleGraph(Graph* g) { OpDesc op1; op1.SetType("op1"); diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index 4b403c46260c6129451809f276aac67ccc17c4d4..3fa84554d99bc46f094fecabe67a58e5b1d7e73b 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -13,10 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/graph_traits.h" - #include -#include -#include namespace paddle { namespace framework { @@ -25,6 +22,8 @@ namespace ir { // // NodesDFSIterator // +class Node; + NodesDFSIterator::NodesDFSIterator(const std::vector &source) { for (auto *x : source) stack_.push(x); } diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h index bb4212bcd33d77cfe1c091b18387e18c4c3e5fa7..a54cc61a63fdee3e35b6fc38da71070b4411e26c 100644 --- a/paddle/fluid/framework/ir/graph_traits.h +++ b/paddle/fluid/framework/ir/graph_traits.h @@ -26,6 +26,9 @@ namespace paddle { namespace framework { namespace ir { +class Graph; +class Node; + template class iterator_range { IteratorT begin_, end_; diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h index 7091aa6a95bd9ebde10bfbd45c98f8757b9d06c4..118c1bc6f3c78e6ec05c897939343df83b1c3851 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.h +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -28,6 +28,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + const char kGraphvizMarkedNodeAttr[] = "__graphviz__marked_node__"; class GraphVizPass : public Pass { diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc index c8dfa02f469a351a8d3495bf19238a723029bb4b..08d09fce5de9c932243f70b18a4c45dc88885b58 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init("identity_scale_op_clean", graph); diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h index d66b411257e530fa5188091702b0b309652ffaa4..7e3d4e19fa84deccce889d18a7840baeff21cfb3 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class IdentityScaleOpCleanPass : public FusePassBase { protected: void ApplyImpl(ir::Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index bf6fe999c1e68c35bc2c19fe38646da93bb1e204..9c1640efcd851c3ba951aa0843c22ad7a000dce5 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + void IsTestPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it " "for activations and pooling."; diff --git a/paddle/fluid/framework/ir/is_test_pass.h b/paddle/fluid/framework/ir/is_test_pass.h index 80cedbf9f850f6fe31c9f2898264e19ebf931c72..abf48480d71aefcf85ba6aec946750114cce5482 100644 --- a/paddle/fluid/framework/ir/is_test_pass.h +++ b/paddle/fluid/framework/ir/is_test_pass.h @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class IsTestPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h index 
f38f48fcd92a6b672254b3d1dda44671652b8ddb..26ec61fd36eb3c309d864221b41681f3d712d81f 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -27,6 +27,7 @@ namespace framework { namespace ir { class Node; +class Graph; /* * Remove the sum op of all gradients of the backward op. diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc index b1afa47910fadfaf3560d15cb0bbe88ae0da7371..b12b84d4a491b3747a59b99b52903ddff2509fdf 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc @@ -12,24 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include -#include -#include -#include + +#include "glog/logging.h" #include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { namespace ir { +class Graph; + using OpHandleBase = details::OpHandleBase; using ComputationOpHandle = details::ComputationOpHandle; using VarHandle = details::VarHandle; diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc index ce7f27d27559c70cf164f6bb641fa0ee6f02a2a0..0cdde5c757aaf0e73281f889dff63d2ca1049f52 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include + #include #include #include @@ -22,13 +23,15 @@ #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { namespace ir { +class Graph; + class BufferSharedInplaceOpPass : public MemoryReusePass { protected: std::string ReuseType() const override { return "inplace"; } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc index 3e3b9864a7b408267ac73de053c1692628e9a14c..72e29dfe156e83db06f78d025106b564064b955d 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc @@ -16,10 +16,15 @@ #include #include -#include -#include -#include -#include + +namespace paddle { +namespace framework { +namespace details { +class ComputationOpHandle; +class ShareTensorBufferOpHandle; +} // namespace details +} // namespace framework +} // namespace paddle namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h index 1c0c6ae60205b14f97bd15bceeb126d0eb54f654..4a77d116f1e9bb425b5f3375b2d4c650017c2f75 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h @@ -28,6 +28,17 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/pass.h" +namespace paddle { +namespace framework { +class VarDesc; +namespace details { +class ComputationOpHandle; +class ShareTensorBufferOpHandle; +struct VarHandle; +} // namespace details +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { @@ -71,6 +82,8 @@ namespace ir { * a pass to clean all ShareTensorBufferOpHandles and move sharing to * ComputationOpHandle::Run() in the future. */ +class Graph; + class MemoryReusePass : public Pass { protected: void ApplyImpl(Graph *graph) const final; diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc index 11c2508afb5747b6f0f3bba06c68448fef7d384a..7de62d6e48249203ce22a8aaccc2f9b68b16c54c 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc @@ -13,8 +13,14 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h" -#include -#include + +namespace paddle { +namespace framework { +namespace details { +class OpHandleBase; +} // namespace details +} // namespace framework +} // namespace paddle namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h index 5fb2caedba85d2892e18db5e84067c2d2ebada6e..d6f286afc559064fc97ddc7b936651878fa3a0d0 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h @@ -18,8 +18,17 @@ #include #include #include + #include "paddle/fluid/framework/details/op_handle_base.h" +namespace paddle { +namespace framework { +namespace details { +class OpHandleBase; +} // namespace details +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc index ed87f73adf137fdf545209f36f996417031fcda4..e85be0272de7fc1eb997f0ff9385e41dc8514680 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc @@ -13,8 +13,6 @@ // limitations under the License. #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" -#include "paddle/fluid/framework/details/var_handle.h" -#include "paddle/fluid/framework/var_desc.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h index 0e8f4e78d22db0aad8a8120b10ee916ade21829d..d00e4f53022f4c8084438fa9f36a58685672c30d 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h @@ -30,6 +30,9 @@ namespace paddle { namespace framework { class VarDesc; +namespace details { +struct VarHandle; +} // namespace details namespace ir { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc index 45ff275d530857690d1f169bbcf60a99952ae2c2..c33398553ecd2cbe291e9cc605aa23ce318e9efe 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -13,14 +13,22 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" -#include #include +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { +class Graph; + void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -96,12 +104,32 @@ REGISTER_PASS(conv_activation_mkldnn_fuse_pass, REGISTER_PASS(conv_relu_mkldnn_fuse_pass, paddle::framework::ir::ConvActivationFusePass); +REGISTER_PASS_CAPABILITY(conv_relu_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("relu", 0)); REGISTER_PASS(conv_leaky_relu_mkldnn_fuse_pass, paddle::framework::ir::Conv2DLeakyReLUFusePass); +REGISTER_PASS_CAPABILITY(conv_leaky_relu_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .LE("leaky_relu", 1)); REGISTER_PASS(conv_relu6_mkldnn_fuse_pass, paddle::framework::ir::Conv2DReLU6FusePass); +REGISTER_PASS_CAPABILITY(conv_relu6_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("relu6", 0)); REGISTER_PASS(conv_swish_mkldnn_fuse_pass, paddle::framework::ir::Conv2DSwishFusePass); +REGISTER_PASS_CAPABILITY(conv_swish_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("swish", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h index ac15fc0451285d4d5575dbc08f430625912ac823..be6b1e07c022b718b1998b89044589ad2d3567d2 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -26,6 +27,8 @@ namespace ir { /* * Fuse Conv and Activation base class. */ +class Graph; + class ConvActivationFusePass : public FusePassBase { public: virtual ~ConvActivationFusePass() {} diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index f7a8e3e3f6c3c77e978c57eeb7515d8cfce86471..63524294b68ef1199d49bb9deeaf80f5526c6eff 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -14,7 +14,6 @@ #include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" #include -#include #include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h index 833fbc748ebd03377ebaa6a5fa72d334ff8b7d37..9a83310ebfb558f4744ae508155d8aa8d01a39c7 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h @@ -13,16 +13,20 @@ // limitations under the License. 
#pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" + namespace paddle { namespace framework { namespace ir { /* * Fuse the Conv and Elementwise_add to a ConvBiasOp. */ +class Graph; + class ConvBiasFusePass : public FusePassBase { public: virtual ~ConvBiasFusePass() {} diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc index af64cb22054e9f2ea751bb993a39e8be563ae458..76e102125501144cbfd06ced2c88b4f1e02e261b 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc @@ -14,12 +14,15 @@ #include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h" #include +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { namespace ir { +class Graph; + void ConvConcatReLUFusePass::FindConcatWithConvs( ir::Graph* graph, std::unordered_map* concat_with_convs_counter) const { @@ -121,3 +124,10 @@ void ConvConcatReLUFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(conv_concat_relu_mkldnn_fuse_pass, paddle::framework::ir::ConvConcatReLUFusePass); + +REGISTER_PASS_CAPABILITY(conv_concat_relu_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("concat", 0) + .EQ("relu", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h index 91ff0760f0483c41cb5be5507426290c90142b13..f1faa84f3d59b736b35ee2c206976c899d3366bf 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -30,6 +31,8 @@ namespace ir { * to a: * (multi ConvReLU) -> Concat -> next_op. */ +class Graph; + class ConvConcatReLUFusePass : public FusePassBase { public: virtual ~ConvConcatReLUFusePass() {} diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index b95aec34d30745d99f6066e36f19c883927e2b53..2ba4c80678f0890b05c6d4c9822d8c5c9a032dc4 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -28,6 +28,13 @@ namespace paddle { namespace framework { namespace ir { +class Graph; +class GraphPatternDetector; +class Node; +namespace patterns { +struct Conv; +} // namespace patterns + using graph_ptr = ir::Graph*; using GraphWithStats = std::pair; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index aa0979b4be64ae3ccebbd7cc82abcf4a4712527a..0254b5e75735152e4720116e8d47b998608f1a01 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -13,12 +13,11 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" -#include + #include #include #include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/platform/errors.h" + #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 21219e7dca8c712a09650779f7ef803052a85748..bd87b31b781ec6217f3b084990fb22390b08925e 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -31,6 +32,9 @@ namespace ir { * bool denotes whether quantization of the variable should be done to unsigned * type. */ +class Graph; +class Node; + using VarQuantScale = std::unordered_map>; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index bc268a834780cad843a18a74bb7f50a639db103d..2146d833ddf82d4086f5036f0148e87d51062431 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" -#include #include namespace paddle { namespace framework { namespace ir { +class Graph; + void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; const auto& excluded_ids_list = diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h index f3229e59d6ffb97514adb9c871d4fb981fc964e0..474fa63f60e413132421c6cc06942413658388ab 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -26,6 +27,8 @@ namespace ir { /* * Specifies which operators should be quantized. */ +class Graph; + class CPUQuantizePlacementPass : public FusePassBase { protected: void ApplyImpl(ir::Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index bc24c10d9d0ae545d0dc71160d66e02a9fdbd730..54ab244a99bd4cd555c37b47558b4492d46862f9 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -14,9 +14,10 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h" -#include + #include #include + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" @@ -24,6 +25,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + using string::PrettyLogDetail; void CPUQuantizeSquashPass::FindNodesToKeep( diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h index 98a518e4e532bb250459448e864a4fb89d55686f..d1465f9da5cc2f7973cf67c1d455984e475d79e2 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -29,6 +30,8 @@ namespace ir { /* * Squash dequantize->quantize pair pattern into requantize op */ +class Graph; + class CPUQuantizeSquashPass : public FusePassBase { public: virtual ~CPUQuantizeSquashPass() {} diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc index df5ba3314e637fefe930d4c45f431314dd7d8493..b2c0afdc754fb7aa3b3ffaf09e5b1961c080bcd6 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -20,6 +21,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + #define GET_NODE(id, pattern) \ PADDLE_ENFORCE_NE(subgraph.count(pattern.RetrieveNode(#id)), 0, \ platform::errors::InvalidArgument( \ diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h index ca314afde57bbc5a339b2016a2540309b31f0598..0f4ecc71ad72020b089821a0cadc4156718230e8 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class DepthwiseConvMKLDNNPass : public FusePassBase { public: virtual ~DepthwiseConvMKLDNNPass() {} diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc index 95afc5483763d11c07e7d316d9f5aa29f97172b8..6efa9f6b74995450f6758200acc414e1aa2bef7f 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc @@ -13,18 +13,21 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h" -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor.h" + #include "paddle/fluid/platform/enforce.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { +class Graph; + void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h index 97c6b242989591ccf24e52a969bfcedc4f377c3f..df02250394a191b9e863e2c0bd73578b235eb79d 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -25,6 +26,8 @@ namespace ir { /* * Transpose weights of FC to comply with MKL-DNN interface */ +class Graph; + class FCMKLDNNPass : public FusePassBase { public: virtual ~FCMKLDNNPass() {} diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc index 37c14e1d8e3b90f223c8dff7396d96594b9286d7..41b859f0af665eae6d9ccb6a08cd29db5ce67fdf 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h" #include #include +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -98,3 +99,10 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { REGISTER_PASS(matmul_transpose_reshape_fuse_pass, paddle::framework::ir::MatmulTransposeReshapeMKLDNNPass); + +REGISTER_PASS_CAPABILITY(matmul_transpose_reshape_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("matmul", 0) + .EQ("transpose", 0) + .EQ("reshape", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h index 77e30b353467c7baca7baaac80b56e47ffef81ef..ef469bac40c4edbc524ef4b24c8df932819f0a3a 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -22,6 +23,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class MatmulTransposeReshapeMKLDNNPass : public FusePassBase { public: virtual ~MatmulTransposeReshapeMKLDNNPass() {} @@ -30,6 +33,6 @@ class MatmulTransposeReshapeMKLDNNPass : public FusePassBase { void ApplyImpl(Graph* graph) const override; const std::string name_scope_{"matmul_transpose_reshape_fuse"}; }; -} +} // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h 
index de699430189bfab7ccc16f0651781304bc36f8da..ca56a8900ca4f7edac7be095a0968555bf628124 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/ir/placement_pass_base.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h index eab9f095623029e79a0d0f9b2697f45ac30bf3a9..7a53b3c498413e43eea7b2e4697791d36fed1149 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -25,6 +26,8 @@ namespace ir { /* * Fuse Reshape->Transpose->MatMul when MatMul uses mkldnn. */ +class Graph; + class ReshapeTransposeMatmulMkldnnFusePass : public FusePassBase { public: virtual ~ReshapeTransposeMatmulMkldnnFusePass() {} diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc index 6c87e437caa1b159c889a68b4d6f5b1790217ca1..0784a1a024cfd31cfb2d2a3ea205518416c2ad13 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc @@ -13,15 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h" + #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { namespace ir { +class Graph; + using string::PrettyLogDetail; void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { @@ -86,3 +91,9 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(scale_matmul_fuse_pass, paddle::framework::ir::ScaleMatmulFusePass); + +REGISTER_PASS_CAPABILITY(scale_matmul_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("scale", 0) + .EQ("matmul", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h index fe97b9681ce7b1c489833baa96187a50e595af3d..32ff78d9a73683c700ceb31a1505538ff7ee6119 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class ScaleMatmulFusePass : public FusePassBase { public: virtual ~ScaleMatmulFusePass() {} diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.h b/paddle/fluid/framework/ir/multi_batch_merge_pass.h index a89616683d9c625111272fd8c1de237a5c9dbe8f..ae2e68c6003a1ce2e953c1231792e674ae7cafd7 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.h +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.h @@ -31,6 +31,8 @@ namespace ir { // sync training, we can simulate even large batch size as if we have more // GPUs. 
+class Graph;
+
 class BatchMergePass : public Pass {
  public:
  virtual ~BatchMergePass() {}
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
index bb6c80790746ee16e39ef8f7e251d54ac563242a..bb3586ba8048081f8b990e9e7eb6c85c3f6e1026 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
@@ -20,17 +20,32 @@
 #include
 #include
 #include
+
 #include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph.h"
+namespace paddle {
+namespace framework {
+namespace details {
+class OpHandleBase;
+struct VarHandle;
+} // namespace details
+namespace ir {
+class Graph;
+} // namespace ir
+} // namespace framework
+} // namespace paddle
+
 namespace paddle {
 namespace platform {
 class NCCLContextMap;
+class NCCLCommunicator;
 }
 namespace framework {
 class Scope;
+
 namespace ir {
 constexpr char kLossVarName[] = "loss_var_name";
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
index bcbd1e066cc1fd056f7de018a697fb842ad195eb..78e90f82bfbef344d4c8514838ce7dc99c474ebf 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
@@ -12,13 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include
-#include
-#include
-#include
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+class OpDesc;
+} // namespace framework
+} // namespace paddle
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h
index 0038790cae245805652ff58148cda2fea94b613b..d37b21f058434e8d46e0f00ad8613f353ccdfb42 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h
@@ -17,10 +17,18 @@
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/scope.h"
+namespace paddle {
+namespace framework {
+class Scope;
+} // namespace framework
+} // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
+class Graph;
+
 void InitReaderQueueDeviceCount(Graph *graph, const Scope &scope,
                                 size_t dev_cnt);
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index 9d2b4ebaf8ccf33e175e46c08657e7eeed467055..d1fbc8396ba55523f3769a26ceaf9ef4e7fcf65e 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -13,11 +13,11 @@
 // limitations under the License.
#include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h" -#include + #include #include #include -#include "paddle/fluid/framework/ddim.h" + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h index 0afa00fc62aa79c8a63350bc63cfe464999ca0e4..f5327dc71080be9edff30855a157465e0b35712a 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h @@ -16,10 +16,19 @@ #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 45d81b937392244f678fbd01395b3ffffd07f710..7143c9a7a3e86c88920ebfc66f90650e68505c74 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/node.h" -#include "paddle/fluid/framework/op_info.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 87e7e64acb71a5059b2f3bf1539ff281ac322774..d0db3bd36e13fa69d6c549916adc5ca1365784c4 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -19,10 +19,18 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/macros.h" +namespace paddle { +namespace framework { +class OpDesc; +class VarDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index a5ca13f1ce252d2368e2fc765e49d397356660a7..0e5f5867f47b25f3efdcf648c4243cec310ad4ca 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -14,11 +14,15 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" -#include -#include - #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 0f5ef551f044d9e53b04b6efad3954d1a48a0ac3..668dc74eab20a17d3697ebe778a1a5bb63cdab48 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" @@ -29,6 +30,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { namespace ir { +class Graph; template struct PassRegistrar; diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc index 6457bd230c59cfebd19ab7951b2c04a1890e3fce..4e99271a2ec1e5d4a0c2f28e91939ce7f5855d37 100644 --- a/paddle/fluid/framework/ir/pass_builder.cc +++ b/paddle/fluid/framework/ir/pass_builder.cc @@ -20,6 +20,8 @@ namespace paddle { namespace framework { namespace ir { +class Pass; + std::shared_ptr PassBuilder::AppendPass(const std::string& pass_type) { VLOG(1) << "Append " << pass_type; auto pass = ir::PassRegistry::Instance().Get(pass_type); diff --git a/paddle/fluid/framework/ir/pass_builder.h b/paddle/fluid/framework/ir/pass_builder.h index 733d3a3ad1ab8989ea30fe45cd7e1ffe9432de13..0e68767db3fa8e49d946d523043cc57c32b0ee99 100644 --- a/paddle/fluid/framework/ir/pass_builder.h +++ b/paddle/fluid/framework/ir/pass_builder.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include "paddle/fluid/framework/ir/pass.h" @@ -22,6 +23,8 @@ namespace paddle { namespace framework { namespace ir { +class Pass; + class PassBuilder { public: PassBuilder() {} diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc index 0c5286b3f77e10876b0240e1245ca343471770d5..65b9c427869ee52572beb6e1696279e9892286ce 100644 --- a/paddle/fluid/framework/ir/pass_test.cc +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -13,15 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" -#include + #include -#include + #include "gtest/gtest.h" -#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { namespace ir { +class Graph; +class Node; + void BuildCircleGraph(Graph* g) { ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); diff --git a/paddle/fluid/framework/ir/placement_pass_base.h b/paddle/fluid/framework/ir/placement_pass_base.h index 5cdd7963e54b9668165fd5e71d8164d71206ecaf..ef1a920db3fd169904d4ebbd8fe0635444d17bd7 100644 --- a/paddle/fluid/framework/ir/placement_pass_base.h +++ b/paddle/fluid/framework/ir/placement_pass_base.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/ir/pass.h" namespace paddle { @@ -25,6 +26,8 @@ namespace ir { /* * Specifies which operators should use cuDNN. */ +class Graph; + class PlacementPassBase : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index 9f6032ffa5b87daece107ad6bd3d5f9444719e44..af4a2f4060572fdffbfaa9fdf2647aa025df9957 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -13,10 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" -#include // for max +#include #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h index ae777bccebec9f99b4752fe495f96d3da38aac23..0be217cc748a248f4e5bf8d98922cb8ebdbd3e3c 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -26,6 +27,8 @@ namespace ir { /** * Fuse Repeated FC Relu */ +class Graph; + class RepeatedFCReluFusePass : public FusePassBase { public: virtual ~RepeatedFCReluFusePass() {} diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.h b/paddle/fluid/framework/ir/runtime_context_cache_pass.h index e4783166e0cbde0be9037df5afe3e903a40a2065..741adcce8d76713001d1c2385d7acf8855092d3e 100644 --- a/paddle/fluid/framework/ir/runtime_context_cache_pass.h +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h @@ -15,12 +15,15 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { +class Graph; + class RuntimeContextCachePass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc index 19ec2d818a3db5140031287618f054f8468970fe..4101d593086cdbf8848034cd478e068c95d8f790 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -16,10 +16,8 @@ #include #include #include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/graph_viz_pass.h" -#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -258,3 +256,15 @@ void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(seq_concat_fc_fuse_pass, paddle::framework::ir::SeqConcatFcFusePass); +REGISTER_PASS_CAPABILITY(seq_concat_fc_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("sequence_expand", 0) + .EQ("concat", 0) + .EQ("mul", 0) + .EQ("elementwise_add", 0) + .EQ("sigmoid", 0) + .EQ("tanh", 0) + .EQ("relu", 0) + .EQ("identity", 0) + .EQ("fusion_seqexpand_concat_fc", 0)); diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h index d68840a554777e64082f7f9e467221bc0948d9dd..a70411536455757b49292e990d27e372651b88c9 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h @@ -22,6 +22,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class SeqConcatFcFusePass : public FusePassBase { public: virtual ~SeqConcatFcFusePass() {} diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc index 75ab04f1b9130dccd42cea39dc0e074e2e2838eb..d9a1348e05a1291cb1bf8e398c187afaebecfca6 100644 --- 
a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h" #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -22,6 +23,8 @@ namespace paddle { namespace framework { namespace ir { +class Node; + int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h index fde9b586c85712b14d285cec49f9e09efad78fc7..6f623625f51d8217370f2eabfb6820eebeb6e07a 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -23,6 +24,8 @@ namespace paddle { namespace framework { namespace ir { +class Graph; + class SeqConvEltAddReluFusePass : public FusePassBase { public: virtual ~SeqConvEltAddReluFusePass() {} diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc index 1c220ee4d571815eaf26255db2c519dc4821068c..b6badf745c6bd5a972f3aef438517015228fe612 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -16,7 +16,14 @@ #include #include #include -#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle #define MAX_CONCAT_INPUTS 200 diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h index 40a9edc5e642320996f5bd3451479fe347f24081..482fd5cb5802d5e685dcf0eb41f23bd4d66b40db 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -37,6 +38,8 @@ namespace ir { * FusionSeqPoolConcat * | */ +class Graph; + class SeqPoolConcatFusePass : public FusePassBase { public: virtual ~SeqPoolConcatFusePass() {} diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc index 8261bfc15348f90af4ed7acb9e5b68373dc5e715..d639d410466d947caeb32cada32c185d376a61c1 100644 --- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc @@ -13,16 +13,17 @@ * limitations under the License. 
*/ #include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h" -#include #include #include #include -#include "paddle/fluid/framework/lod_tensor.h" namespace paddle { namespace framework { namespace ir { +class Graph; +class Node; + namespace { static PDNode* BuildCVMConcatPattern(PDPattern* pattern) { auto cvm_behind_x = [](Node* x) -> bool { diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h index 88a41983c6bf7b4e76d7912dbb3821b2c2ed533b..b0a3573fb59f975400e43b6ae842c23ad262e2ff 100644 --- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -39,6 +40,8 @@ namespace ir { * FusionSeqPoolCVMConcat * | */ +class Graph; + class SeqPoolCVMConcatFusePass : public FusePassBase { public: virtual ~SeqPoolCVMConcatFusePass() {} diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc index 8bdf3940928c768fc7b0a9c7fa3d084d95f60859..92e995579fa9e5594718c88586322038b1688047 100644 --- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - -#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h index 008f8013efd28b3cdc5a846662653e07e45e3985..dc375988cdd5d42b1e1a030952fbf8ecb4fdc675 100644 --- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc index 2e5c18d3352bc655756522b32cbfd92f7ad4b4cb..5cc6b6171ac3be3951c550fb160115c982dbdb7d 100644 --- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { namespace framework { @@ -30,6 +29,8 @@ namespace ir { * - remove dropout_op (upscale_in_train) or * replace dropout_op with scale_op (downgrade_in_infer) when is_test is true */ +class Graph; + void SimplifyWithBasicOpsPass::ApplyImpl(Graph* graph) const { VLOG(3) << "Simplify the Graph with basic ops."; std::unordered_set del_node_set; diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h index f5185622468055939103876387662f6402a45bfe..6a245c444a7ec8dd800d8432693d2fa247360634 100644 --- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h @@ -16,12 +16,16 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { +class Graph; +class Node; + class SimplifyWithBasicOpsPass : public Pass { protected: void ApplyImpl(Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index 2e3cd16d5ce49fdd6186f98c72d77c75c4053559..e5f348dfeb13e97632aa4901b6109576a21f67af 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h" + #include #include -#include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h index 2de8d3762219ffc0ff5500996f53dd8e3117dc61..3a3e50052396a538aebb9027cb444b819129af95 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h @@ -29,6 +29,8 @@ namespace ir { // | | // other_op3 // | +class Graph; + class SkipLayerNormFusePass : public FusePassBase { public: virtual ~SkipLayerNormFusePass() {} diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h index 56b7ec9b84314bd3634c406c31e20dd421f7fa92..90def957df4bf0907a306798fbb1e9ba53c37919 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -26,6 +27,8 @@ namespace ir { /** * Fuse ( (A * B).^2 - (A.^2 * B.^2) ) .* scalar */ +class Graph; + class SquaredMatSubFusePass : public FusePassBase { public: virtual ~SquaredMatSubFusePass() {} diff --git a/paddle/fluid/framework/ir/subgraph_detector.cc b/paddle/fluid/framework/ir/subgraph_detector.cc index 7979953d7be827ffc944ae939782923504802bbc..6ebe900e26baad40d69beeb082e76b072a863621 100644 --- a/paddle/fluid/framework/ir/subgraph_detector.cc +++ b/paddle/fluid/framework/ir/subgraph_detector.cc @@ -13,18 +13,17 @@ See the License for the specific language governing permissions and limitations under the 
License. */ #include "paddle/fluid/framework/ir/subgraph_detector.h" -#include #include #include #include -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/node.h" namespace paddle { namespace framework { namespace ir { +class Graph; +class Node; + std::pair, std::vector> ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT std::unordered_set nodes(graph.begin(), graph.end()); diff --git a/paddle/fluid/framework/ir/subgraph_detector.h b/paddle/fluid/framework/ir/subgraph_detector.h index 3d279e27e6ae97978e46f7282f58d4a4ca1efdde..6bd73c758b377c9653b23b4ac4ad3b08fb1aea8f 100644 --- a/paddle/fluid/framework/ir/subgraph_detector.h +++ b/paddle/fluid/framework/ir/subgraph_detector.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/node.h" @@ -24,6 +25,9 @@ namespace paddle { namespace framework { namespace ir { +class Graph; +class Node; + const char kIsFunctionNode[] = "__is_function_node__"; const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__"; const char kSubgraphSplitterMarkerAttrName[] = diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc index 222c73761b4f5d960b1efafdb2e4368c17519fec..3fa008c300c5c8d0741bcdb1c4b3ef2dc8b0c0fc 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc @@ -12,15 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include #include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { +class Graph; + class SyncBatchNormPass : public Pass { protected: void ApplyImpl(ir::Graph *graph) const override { diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index 405cefa99ebbbe147fc96f63567e13607732780e..2db6d0230e3f9265be8a6d150f75ad5c04b3fef0 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include #include #include diff --git a/paddle/fluid/framework/lod_rank_table.h b/paddle/fluid/framework/lod_rank_table.h index 8c6e8b0c66ead96f0e53b56ee951887730b0d77f..9a7c1285e305cf3527265f75db0d3b5c02d5f507 100644 --- a/paddle/fluid/framework/lod_rank_table.h +++ b/paddle/fluid/framework/lod_rank_table.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 40615d772e555bb9e2ac44a6339de9f3be3c9562..a044812dd3153b5752201ed57513e04cef8aecf4 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -12,19 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/lod_tensor.h" #include -#include #include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/version.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/memory/memory.h" +namespace paddle { +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index da97efb616840b6663677475c4ca5dab68d7ccfe..e09a628f491604e74449965ccc3e19a25a78e66d 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -24,6 +24,7 @@ limitations under the License. */ #endif #include + #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/tensor.h" @@ -31,6 +32,15 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +namespace paddle { +namespace framework { +class LoDTensor; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index c93c3f2673b1d80ef1e1a9dd68ad50501ba16f42..e3223e67fc94df883dc4ef2fcea3a6ecb7ef9512 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -14,9 +14,6 @@ #include #include -#include -#include -#include #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 280996d34dd73e067e4e42848ea52dbbd6745caa..3a6e80f718d18f392e0548d883c13288ef38eb47 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -20,14 +20,14 @@ limitations under the License. */ #include // NOLINT #include #include + +#include "glog/logging.h" #include "paddle/fluid/framework/details/cow_ptr.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" -#include "glog/logging.h" - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc index 0599c8d384641606b0a5ebb5ba1781b56f539e63..a40a3ff33fe354fd50b2e9cc24dd6a73da01a62a 100644 --- a/paddle/fluid/framework/mixed_vector_test.cc +++ b/paddle/fluid/framework/mixed_vector_test.cc @@ -12,11 +12,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include - +#include "paddle/fluid/framework/mixed_vector.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/framework/mixed_vector.h" template using vec = paddle::framework::Vector; diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index 81402a650a3e334e273c18b279c241282ac5bf1f..75677ef5243da698caff0788a200c206b47dc875 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -14,8 +14,10 @@ #pragma once +#include #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -28,6 +30,10 @@ namespace framework { * Simple, intuitive and effective. Only single thread is supported, and * currently designed for inference. */ +class LoDTensor; +class ProgramDesc; +class Scope; + class NaiveExecutor { public: explicit NaiveExecutor(const platform::Place& place) : place_(place) {} diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.h b/paddle/fluid/framework/no_need_buffer_vars_inference.h index 5d30f34090e230f1766a38992674dd9d0dc9a137..21ba0381fe6ccfc9b5643e2963828b816ea6d96d 100644 --- a/paddle/fluid/framework/no_need_buffer_vars_inference.h +++ b/paddle/fluid/framework/no_need_buffer_vars_inference.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/op_call_stack.h b/paddle/fluid/framework/op_call_stack.h index d48cf27285a0a5040b5e375a27ccf6a8b00bd8c0..f633538e700b242469bce6d76dfb58e89f9cdbe8 100644 --- a/paddle/fluid/framework/op_call_stack.h +++ b/paddle/fluid/framework/op_call_stack.h @@ -15,9 +15,16 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" +namespace paddle { +namespace platform { +struct EnforceNotMet; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_compatible_info.h b/paddle/fluid/framework/op_compatible_info.h index 08b5734b5bfe33d4269c06d639448eefd26fcb06..01fbdef99cbbc287d0d473b48fd42ca0706445af 100644 --- a/paddle/fluid/framework/op_compatible_info.h +++ b/paddle/fluid/framework/op_compatible_info.h @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/program_desc.h" #pragma once @@ -21,6 +22,10 @@ namespace paddle { namespace framework { +namespace proto { +class OpCompatibleMap; +} // namespace proto + enum class OpCompatibleType { compatible = 0, // support previous version DEFIN_NOT = 1, // definitely can't support previous version diff --git a/paddle/fluid/framework/op_compatible_info_test.cc b/paddle/fluid/framework/op_compatible_info_test.cc index 43959c8b2abd868a705b5e21950cdca2faaebe5c..98f3f5071ad285feaadcdf2b13a3c756477b2f18 100644 --- a/paddle/fluid/framework/op_compatible_info_test.cc +++ b/paddle/fluid/framework/op_compatible_info_test.cc @@ -13,14 +13,17 @@ // limitations under the License. 
#include "paddle/fluid/framework/op_compatible_info.h" -#include + #include "gtest/gtest.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/platform/macros.h" namespace paddle { namespace framework { +namespace proto { +class OpCompatibleMap; +} // namespace proto + TEST(test_op_compatible_info, test_op_compatible) { auto comp_map = OpCompatibleMap(); comp_map.InitOpCompatibleMap(); diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index e15f0012fdc2ebfecec8daebbd3c04b917cd7a84..95c33bca6c7f1df6ad71a3b4c2f82d726cafb5fc 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/var_desc.h" @@ -27,6 +28,7 @@ namespace framework { class BlockDesc; class ProgramDesc; + class OpDesc { public: OpDesc() {} diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc index c815e194d43e149f9efe0daec820c42e87f81d0c..820a83586b31701cf0cc4e6b968be2fd48fe2f96 100644 --- a/paddle/fluid/framework/op_info.cc +++ b/paddle/fluid/framework/op_info.cc @@ -14,8 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_info.h" #include -#include -#include namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index 89b499975790060a3a3e3f665c35f8545922e6a7..af657232e91a68aa26ab85faf63acdd1b8f191d1 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -20,13 +20,18 @@ limitations under the License. */ #include #include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" namespace paddle { namespace framework { +class InferShapeContext; +class OpAttrChecker; + class InferShapeBase { public: virtual ~InferShapeBase() = default; diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index 9edc1a3e150027b5a3dbd8483dc8b58d1d4ab918..f4e60bb9b78547e84941a1caf60399044ec79143 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/library_type.h" diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index 40db85400d2c8776b82ce0fa2fb4deed993b0255..2979750fba792495e143e070523472a192ee12de 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_kernel_type.h" + #include -#include TEST(OpKernelType, ToString) { using OpKernelType = paddle::framework::OpKernelType; diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index a8030d377fdb4d4aef74b315e21792dad10fac96..56f940e399776a08626863d6f2b72ef37a294233 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -16,6 +16,12 @@ limitations under the License. */ #include "gtest/gtest.h" +namespace paddle { +namespace platform { +struct EnforceNotMet; +} // namespace platform +} // namespace paddle + class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { public: void Make() { diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc index 81cfaf92e391de28356dec05fd7b94cce3e51440..72dd6fa6bbd139c489ccea62ba46c73772f72033 100644 --- a/paddle/fluid/framework/op_registry.cc +++ b/paddle/fluid/framework/op_registry.cc @@ -14,8 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6408fadf90ae32adf048156d1369cf22a76d20ea..77383386fa11d5bb297ac5636c8ec122a69a7e6b 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -35,6 +35,12 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/shape_inference.h" +namespace paddle { +namespace framework { +class ExecutionContext; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index 239dbc4357854a8962567129b259a64260308b49..d6b18751cefe56c75da8d0f6d6070b849e872bfc 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -12,7 +12,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index bd52d7ffef5040f596bfb5ca9521a6e1062bb5aa..d493f350e69736fddc2cbda56a8e3967235bce8a 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -39,6 +39,15 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/variant.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class OpInfo; +class Scope; +class Variable; +} // namespace framework +} // namespace paddle + DECLARE_int32(inner_op_parallelism); namespace paddle { @@ -105,8 +114,8 @@ inline std::string GradOriginalVarName(const std::string& grad_var_name) { const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var); Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); -class OperatorBase; class ExecutionContext; +class OperatorBase; class RuntimeContext { public: diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index d5e9c755f1298bcd31ea9093b5912a46133d0228..5cafc9111da6734405b2150328f3f356c92c69fd 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/proto_desc.h" diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc index 48bde2785e6a51afc0d2905ac31fe20a3c3019b6..0ba1099b03231443d59c0dc820af140ab6ab4053 100644 --- a/paddle/fluid/framework/program_desc_test.cc +++ b/paddle/fluid/framework/program_desc_test.cc @@ -13,11 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" + #include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" namespace paddle { namespace framework { +class VarDesc; + TEST(ProgramDesc, copy_ctor) { ProgramDesc program; auto* global_block = program.MutableBlock(0); diff --git a/paddle/fluid/framework/prune.h b/paddle/fluid/framework/prune.h index 63e8aaad73575e7561428a9de43b86f75c22e65e..57f282536bf46e374cab24339790a715f4669e7b 100644 --- a/paddle/fluid/framework/prune.h +++ b/paddle/fluid/framework/prune.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc index 12fa0c61f8121d475a0cf2aa78e4bb995a01b132..618eaba3c5b8bb09a4e413b32db62f3a80d3c557 100644 --- a/paddle/fluid/framework/prune_test.cc +++ b/paddle/fluid/framework/prune_test.cc @@ -15,13 +15,9 @@ limitations under the License. */ #include "paddle/fluid/framework/prune.h" #include -#include #include #include -#include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/operator.h" - #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 9f4c817db7d81ff6d8a9afb9d5cee7b4f1dd0ed2..6aeef8a39b53342a1c7eb99ba0892bda29a8fbcd 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -12,12 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "paddle/fluid/framework/device_worker.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" namespace paddle { namespace framework { +class LoDTensor; +class Scope; +class Variable; + std::shared_ptr PullDenseWorker::s_instance_ = NULL; std::mutex PullDenseWorker::mutex_for_version_; std::map PullDenseWorker::last_versions_; @@ -58,13 +62,15 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { fleet_ptr_ = FleetWrapper::GetInstance(); #ifdef PADDLE_WITH_CUDA copy_streams_.clear(); +#endif +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) places_.clear(); thread_scopes_.clear(); #endif } void PullDenseWorker::CreatePinVar() { -#ifdef PADDLE_WITH_CUDA +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_PSLIB) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); @@ -79,8 +85,13 @@ void PullDenseWorker::CreatePinVar() { auto* ptr = root_scope_->Var(name + "pin"); InitializeVariable(ptr, proto::VarType::LOD_TENSOR); LoDTensor* pin_tensor = ptr->GetMutable(); +#ifdef PADDLE_WITH_CUDA pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); +#endif +#ifdef PADDLE_WITH_XPU + pin_tensor->mutable_data(tensor->dims(), platform::CPUPlace()); +#endif } } #endif @@ -103,7 +114,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { exit(-1); } status_vec->resize(0); -#ifdef PADDLE_WITH_CUDA +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) for (size_t i = 0; i < places_.size(); ++i) { // for (auto& v : dense_value_names_) { @@ -121,9 +132,16 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { Variable* var = thread_scopes_[i]->FindVar(name); LoDTensor* tensor = var->GetMutable(); float* w = tensor->data(); +#ifdef PADDLE_WITH_CUDA memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w, platform::CUDAPinnedPlace(), pin_w, sizeof(float) * tensor->numel(), copy_streams_[i]); +#endif +#ifdef PADDLE_WITH_XPU + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w, + platform::CPUPlace(), pin_w, + sizeof(float) * tensor->numel()); +#endif } } } @@ -144,7 +162,7 @@ void PullDenseWorker::PullDense(bool force_update) { uint64_t tid = static_cast( dwp_param_.program_config(0).pull_dense_table_id(i)); if (force_update || CheckUpdateParam(tid)) { -#ifdef PADDLE_WITH_CUDA +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid], &pull_dense_status_, false); diff --git a/paddle/fluid/framework/rw_lock_test.cc b/paddle/fluid/framework/rw_lock_test.cc index 16f9cbb65229f10912ee90436c3557aaaca169b8..601b10787bea15cc96b47c051f6c3f42df5697e5 100644 --- a/paddle/fluid/framework/rw_lock_test.cc +++ b/paddle/fluid/framework/rw_lock_test.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/rw_lock.h" #include -#include // NOLINT #include // NOLINT #include diff --git a/paddle/fluid/framework/save_load_util.h b/paddle/fluid/framework/save_load_util.h index 6b50c93ddbd2be8aba65cbdd8942278a2e1af3a2..0b96e002e30acf986ee853e49ba9b70cf2a47d27 100644 --- a/paddle/fluid/framework/save_load_util.h +++ b/paddle/fluid/framework/save_load_util.h @@ -28,6 +28,8 @@ namespace paddle { namespace framework { +class Scope; + bool SaveStaticNameListToDisk( const std::string& file_name, const std::vector& vec_tensor_name_list, const Scope& scope); diff --git a/paddle/fluid/framework/save_load_util_test.cc b/paddle/fluid/framework/save_load_util_test.cc index 4a54e2d46684936bf32d8b87181dd746fd1b142c..10a34d7ce91ad27a271b8a112a1af1e39da52e31 100644 --- a/paddle/fluid/framework/save_load_util_test.cc +++ b/paddle/fluid/framework/save_load_util_test.cc @@ -13,12 +13,9 @@ // limitations under the License. #include #include -#include -#include #include "gtest/gtest.h" #include "paddle/fluid/framework/save_load_util.h" -#include "paddle/fluid/platform/macros.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index db7010ecceb3e7c39cdfd78c5e82074dba199fc7..922e9a9b272727ec0eaaf3c8dea5172552af95d1 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -30,6 +30,12 @@ extern "C" { #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/scope_pool.cc b/paddle/fluid/framework/scope_pool.cc index 4bb077a2c52f0853b01a7af30308a0fa737500cb..cf0b3ebcddd2fbc86d54cd732cef96871b8592fd 100644 --- a/paddle/fluid/framework/scope_pool.cc +++ b/paddle/fluid/framework/scope_pool.cc @@ -11,10 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/fluid/framework/scope_pool.h" -#include "paddle/fluid/framework/threadpool.h" +#include namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/scope_pool.h b/paddle/fluid/framework/scope_pool.h index a8b468699abe148d44a395cf888158aefab4380b..19faa9aa6a45fc0212dd212304bfb6603c9791a8 100644 --- a/paddle/fluid/framework/scope_pool.h +++ b/paddle/fluid/framework/scope_pool.h @@ -14,6 +14,7 @@ #pragma once +#include #include // NOLINT #include #include "paddle/fluid/framework/scope.h" @@ -21,6 +22,8 @@ namespace paddle { namespace framework { +class Scope; + class ScopePool { public: static ScopePool &Instance(); // NOLINT diff --git a/paddle/fluid/framework/scope_test.cc b/paddle/fluid/framework/scope_test.cc index 26817fc558dfab6926b67ee744b0a2ef548b4ffb..a61e68279a2d30f7128547aab2d7a5fab2c3f06b 100644 --- a/paddle/fluid/framework/scope_test.cc +++ b/paddle/fluid/framework/scope_test.cc @@ -13,9 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" -#include "glog/logging.h" + #include "gtest/gtest.h" +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + using paddle::framework::Scope; using paddle::framework::Variable; diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 1f402ea9dd33626a43a4d03b96256b2c2841c8b4..4c30c40ad58375fb08f23e2c7bdef27fdaea7384 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -14,6 +14,12 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" +namespace paddle { +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 285af1d55302a49cae058fccdd5edd13aa28137e..48353b43f56cacbb71512b9e743af281b09fc531 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -26,9 +26,18 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/memcpy.h" +namespace paddle { +namespace platform { +class DeviceContext; +class Place; +} // namespace platform +} // namespace paddle + namespace paddle { namespace framework { +class Tensor; + class SelectedRows { /* * @brief We can use the SelectedRows structure to reproduce a sparse table. diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc index f5bb3f68007043ad37ea32e7047c5fc546b80931..02e4ce914b887f07e1e34cfbda533f71cec6bc06 100644 --- a/paddle/fluid/framework/shape_inference.cc +++ b/paddle/fluid/framework/shape_inference.cc @@ -13,11 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/shape_inference.h" -#include -#include -#include -#include "paddle/fluid/framework/grad_op_desc_maker.h" -#include "paddle/fluid/framework/operator.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 0b22bab26789a3e2ebd20428adc236faa8b38dee..9f5d8d30c9cdea0cfa9db1793e18874bcd42aae0 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -13,7 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_type.h" + +namespace paddle { +namespace memory { +namespace allocation { +class Allocation; +} // namespace allocation +} // namespace memory +} // namespace paddle namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index f2ccff2c133a238d02e25c65faf41dd519fdb506..faecba6295d35fd9b6d841181a450a5a540a7074 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/framework.pb.h" @@ -28,6 +29,14 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +namespace paddle { +namespace memory { +namespace allocation { +class Allocation; +} // namespace allocation +} // namespace memory +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index cc972dd93d032c19015c86debebc27f7c8c0d155..92a29d5165ce790bbbc9673979c5994317bb3dfb 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -13,9 +13,15 @@ // limitations under the License. #include "paddle/fluid/framework/tensor.h" + #include #include -#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace platform { +struct float16; +} // namespace platform +} // namespace paddle namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 0e3d11b9f0257905cbede334afd0ad84ff15cb5c..a073dbd733f0b1f1bb1b94221b99d477cb8dfadb 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -664,7 +664,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type()); auto* data_ptr = tensor.data(); - PADDLE_ENFORCE_LT(size, std::numeric_limits::max(), + PADDLE_ENFORCE_LT(size, (std::numeric_limits::max)(), platform::errors::ResourceExhausted( "tensor size %d overflow when writing tensor", size)); if (platform::is_gpu_place(tensor.place())) { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index a0408dbc3dbb4ffca70ef322d93b662f1b953f7b..1e811a41e90af5f35562eae682669292922a6d69 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" @@ -30,6 +31,8 @@ namespace framework { // If ctx_place and src_place are the same, src_ctx.Wait() is added // after memory::Copy; if ctx_place and dst_place are the same, // src_ctx.Wait() is added before memory::Copy. +class Tensor; + void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst); diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index ef9964b3ae75fd65ded0fb5d94f9a97d06c138d5..e389cb34679a2459b547497a46e97586c7cd72bd 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/tensor_util.h" + #include #include -#include namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 4682bfc264b68997abd0a87233c5ed39e7e50a63..3db8f3e36b7fbdac140487d00476f1c493a4b900 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -13,8 +13,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/threadpool.h" -#include -#include #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 09528b6fc35bf49ac3110440a62aba3200341e15..7fecf07475b1457bf6d19279493517c55f057194 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -23,6 +23,7 @@ limitations under the License. */ #include // NOLINT #include #include + #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 077fe75172022c8fe501bd1143895115298417bf..ecaec49aa461cd6134cb60b7af7adb50f1a94686 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -14,14 +14,15 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include // NOLINT #include #include // NOLINT #include -#include #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker.h" @@ -38,6 +39,15 @@ limitations under the License. */ namespace paddle { namespace framework { +class Dataset; +class LoDTensor; +class ProgramDesc; +class PullDenseWorker; +class Scope; +class VarDesc; +template +class ChannelObject; + class TrainerBase { public: TrainerBase() {} @@ -128,7 +138,8 @@ class DistMultiTrainer : public MultiTrainer { std::shared_ptr pull_dense_worker_; }; -#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ + (defined PADDLE_WITH_PSLIB) class HeterServiceContext { public: HeterServiceContext() {} @@ -141,7 +152,9 @@ class HeterServiceContext { void Reset() { push_dense_status_.clear(); } int place_num_; Scope* scope_{nullptr}; +#ifdef PADDLE_WITH_CUDA cudaEvent_t event_; +#endif std::vector ops_; std::vector<::std::future> push_dense_status_; }; @@ -168,10 +181,18 @@ class HeterXpuTrainer : public TrainerBase { virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } + virtual std::string GetDumpPath(int tid) { return ""; } + virtual void InitDumpEnv() {} template +#ifdef PADDLE_WITH_CUDA void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor, const paddle::platform::Place& thread_place, cudaStream_t stream); +#endif +#ifdef PADDLE_WITH_XPU + void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor, + const paddle::platform::Place& thread_place); +#endif void CreateThreadParam(const ProgramDesc& program, int num); template void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); @@ -197,9 +218,11 @@ class HeterXpuTrainer : public TrainerBase { std::vector op_names_; std::vector place_scopes_; BtObjectPool object_pool_; - std::vector copy_streams_; std::vector places_; +#ifdef PADDLE_WITH_CUDA + std::vector copy_streams_; std::vector events_; +#endif }; #endif diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 31ac11e78cff15f12660b84b96a007690aa77ae3..cc92c50cc428a59905ce3864a0b89f591d1b2390 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -17,11 +17,11 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/trainer.h" - namespace paddle { namespace framework { +class TrainerBase; + typedef std::shared_ptr (*CreatetrainerFunction)(); typedef std::unordered_map trainerMap; trainerMap g_trainer_map; @@ -63,7 +63,8 @@ std::shared_ptr TrainerFactory::CreateTrainer( REGISTER_TRAINER_CLASS(MultiTrainer); REGISTER_TRAINER_CLASS(DistMultiTrainer); -#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ + (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(HeterXpuTrainer); #endif #if defined(PADDLE_WITH_NCCL) diff --git a/paddle/fluid/framework/trainer_factory.h b/paddle/fluid/framework/trainer_factory.h index 9c772a4f19ed9ba50f704ed62ef361555b1285fb..3ef61c03817523d153d966102ca43a30bfdcbb6b 100644 --- a/paddle/fluid/framework/trainer_factory.h +++ b/paddle/fluid/framework/trainer_factory.h @@ -16,11 +16,14 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/trainer.h" namespace paddle { namespace framework { +class TrainerBase; + class TrainerFactory { public: static std::string TrainerTypeList(); diff --git a/paddle/fluid/framework/transfer_scope_cache.h b/paddle/fluid/framework/transfer_scope_cache.h index 9a5d45263219adacddb9e9280feec40cfb1ff903..7ff6020ff2abf1f666ed6332d3014caece072cdd 100644 --- a/paddle/fluid/framework/transfer_scope_cache.h +++ b/paddle/fluid/framework/transfer_scope_cache.h @@ -17,12 +17,16 @@ #include // NOLINT #include #include + #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/scope.h" namespace paddle { namespace framework { +class OpKernelType; +class Scope; + std::unordered_map& global_transfer_data_cache(); std::unordered_set& global_transfer_scope_cache(); diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h index 508ee931c6ed7f66e09abd8f0e4b33c3d3c135fd..35b8220d453addb14a9947cd0828e6c4208fe563 100644 --- a/paddle/fluid/framework/tuple.h +++ b/paddle/fluid/framework/tuple.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/var_desc.h" diff --git a/paddle/fluid/framework/tuple_test.cc b/paddle/fluid/framework/tuple_test.cc index cfdd4dc56e4c006f6c614bda143d5cc11e666732..9060bd3fc89bd8ca2aa7e05a09b0494ceabd3eaa 100644 --- a/paddle/fluid/framework/tuple_test.cc +++ b/paddle/fluid/framework/tuple_test.cc @@ -11,11 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include - -#include "gtest/gtest.h" #include "paddle/fluid/framework/tuple.h" +#include "gtest/gtest.h" TEST(Tuple, Make) { std::vector element_type; diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/fluid/framework/unroll_array_ops.h index ab17641080551166f246c69d42ed85df3892a3e0..a9c047cc6c6acfa5df157389fcd6055957c7bf4c 100644 --- a/paddle/fluid/framework/unroll_array_ops.h +++ b/paddle/fluid/framework/unroll_array_ops.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/framework/unroll_array_ops_test.cc b/paddle/fluid/framework/unroll_array_ops_test.cc index be811478eec17e0986ae7579ff323d94dea3155a..c4fdfdb425f23eb0e1aef3f2414b26afd421bac4 100644 --- a/paddle/fluid/framework/unroll_array_ops_test.cc +++ b/paddle/fluid/framework/unroll_array_ops_test.cc @@ -13,10 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/unroll_array_ops.h" + #include -#include #include -#include namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index e7e964b41818597404d9a6e510c0743f4f8c3f7e..ac455b9ffd7c1231003b337e47a3fa17ec052b9a 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -16,11 +16,11 @@ limitations under the License. */ #include #include - #include -#include #include +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" +#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/unused_var_check.h b/paddle/fluid/framework/unused_var_check.h index 2f44a3bcde0bc2e6fe8b33247444166728d03ad4..d78b4d928f3ec85fe768f2de1316111533579bec 100644 --- a/paddle/fluid/framework/unused_var_check.h +++ b/paddle/fluid/framework/unused_var_check.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include - #include #include diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 2ee0b17b64b6df7a2f66b208f5b5879683db6656..457c0c77b3c427501eebdf27b4d01d47b116c519 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index 6e8be0fdd42e038bf55bd0c63e47fe3cb006cbf4..b37a09a17a985c29d92e0f5dafc7db16cdafc9fc 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h index 9312ac075dec3e240e6fa56a632b50509c0c2632..f649c9388f0f6518dc4f8a587f5c9f9c01451373 100644 --- a/paddle/fluid/framework/var_type_inference.h +++ b/paddle/fluid/framework/var_type_inference.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/type_defs.h" @@ -24,8 +25,8 @@ limitations under the License. */ namespace paddle { namespace framework { -class OpDesc; class BlockDesc; +class OpDesc; class StaticGraphVarTypeInference; // default infer var type context diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index dc86d58f600b83a8ed59f22d9cd73fac7fab13b3..5483ef01c08445b1ee2e44a5288994d7769cf303 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/var_type_inference.h" + #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -22,6 +24,8 @@ limitations under the License. */ namespace paddle { namespace framework { +class Scope; + class NOP : public OperatorBase { public: NOP(const std::string& type, const VariableNameMap& inputs, diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 4875956096a09bea7f9feb3eaf12a16dc28ee255..07387f87411af9c9413e5c83351c8b5836df8284 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -43,12 +44,12 @@ class NCCLCommunicator; } // namespace platform namespace framework { -class Tensor; -class LoDTensor; -class SelectedRows; class LoDRankTable; +class LoDTensor; class ReaderHolder; class Scope; +class SelectedRows; +class Tensor; } // namespace framework namespace operators { diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h index 01a5d09e0728b2af6e9bf650f0d58af43a9a53ab..6e65bc2c932877e5365f4533631d50afae4465b3 100644 --- a/paddle/fluid/framework/variable_helper.h +++ b/paddle/fluid/framework/variable_helper.h @@ -21,6 +21,8 @@ limitations under the License. */ namespace paddle { namespace framework { +class Variable; + void InitializeVariable(Variable* var, proto::VarType::Type var_type); void CopyVariable(const Variable& src_var, Variable* dst_var); diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 511c9c52146ece4b90905cc9d49565103589c1ec..98a8ff9cf3ebf5c877b4611f2cf27d2fe6b1ec20 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -12,12 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" +#include "gtest/gtest.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc index 7ba00f52e67f5431bb0dc70dbd5b743ca611fff8..92042e47259861151333e3c1073fc2a85a5f68f2 100644 --- a/paddle/fluid/framework/version.cc +++ b/paddle/fluid/framework/version.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/version.h" -#include + #include namespace paddle { diff --git a/paddle/fluid/imperative/all_reduce.h b/paddle/fluid/imperative/all_reduce.h index 81880d0fb89b7e372f5b0b158ae9a206eb64215d..249fb4e11f12beb0c7543d96732ffcae6143c8da 100644 --- a/paddle/fluid/imperative/all_reduce.h +++ b/paddle/fluid/imperative/all_reduce.h @@ -25,9 +25,17 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/nccl_context.h" +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + namespace paddle { namespace imperative { +struct ParallelStrategy; + void AllReduce(const framework::Variable &src, framework::Variable *dst, const ParallelStrategy &strategy); diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index c980b014b823e21f117bc6e44037349b06a1fdfd..d4a1519b07e8ca5c3fecee2b276aed27fc4e0c44 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -14,20 +14,17 @@ #include "paddle/fluid/imperative/amp_auto_cast.h" -#include #include -#include #include -#include #include -#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/tracer.h" -#include "paddle/fluid/imperative/variable_wrapper.h" namespace paddle { namespace imperative { +class VarBase; + AmpOperators::AmpOperators() : allow_ops_(new std::unordered_set()), block_ops_(new std::unordered_set()) {} diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index d1da97e5a39057aed3ed0b4a450bd4a4f5c06984..7ab876c1ce82905dec19380ff33f33e23be7c023 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -26,6 +26,8 @@ namespace paddle { namespace imperative { // Singleton implementation with C++ 11 +class Tracer; + class AmpOperators { public: ~AmpOperators(); diff --git a/paddle/fluid/imperative/data_loader.cc b/paddle/fluid/imperative/data_loader.cc index 3b8239e566d21b2a15f0829c52b92ba2aa23d4f3..a2fccf7901ffa430e8d8ecb05c6fd1e32b3b45b6 100644 --- a/paddle/fluid/imperative/data_loader.cc +++ b/paddle/fluid/imperative/data_loader.cc @@ -16,13 +16,9 @@ #include "paddle/fluid/imperative/data_loader.h" -#include #include - -#include #include #include -#include #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 7caeb4378ce3d1ca1d1557054642c9fa184bea39..07f1868b7fa29914b4d362cf2c71d9380ca446be 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/imperative/gradient_accumulator.h" + #include #include #include + #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" @@ -136,9 +138,13 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { return; } - PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true, - "dst_numel %d vs. 
src_numel %d", dst_tensor->numel(), - numel); + PADDLE_ENFORCE_EQ( + dst_tensor->numel(), numel, + platform::errors::PreconditionNotMet( + "The number of elements of source tensor and destination tensor " + "should be equal, but got the number of elements of source tensor is " + "%zu and the number of elements of destination tensor is %zu.", + numel, dst_tensor->numel())); auto data_type = src_tensor.type(); auto place = src_tensor.place(); diff --git a/paddle/fluid/imperative/jit/op_desc_meta.cc b/paddle/fluid/imperative/jit/op_desc_meta.cc index f5c009859003c8047ceb44370e01f51527a57591..d256dafb8919a5ee2025ee3bb061d5d688d23b4b 100644 --- a/paddle/fluid/imperative/jit/op_desc_meta.cc +++ b/paddle/fluid/imperative/jit/op_desc_meta.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/imperative/jit/op_desc_meta.h" -#include "paddle/fluid/framework/op_info.h" namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/jit/op_desc_meta.h b/paddle/fluid/imperative/jit/op_desc_meta.h index 506afee6a99b034b1cc1d3ce209d12dd30fce43d..c59b2c885d2b46524288cd983ca55620a6ba9009 100644 --- a/paddle/fluid/imperative/jit/op_desc_meta.h +++ b/paddle/fluid/imperative/jit/op_desc_meta.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/type_defs.h" diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc index be93a787d46035838d4d956a8d6881d802b93430..59ff5b4eae4419274412160632ed78c02b298867 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.cc +++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc @@ -13,9 +13,15 @@ // limitations under the License. #include "paddle/fluid/imperative/jit/program_desc_tracer.h" + #include #include -#include + +namespace paddle { +namespace imperative { +class VarBase; +} // namespace imperative +} // namespace paddle namespace paddle { namespace imperative { @@ -198,7 +204,8 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( void ProgramDescTracer::InsertVarIfNotExist( const std::shared_ptr &new_var, bool is_input) { - PADDLE_ENFORCE_NOT_NULL(new_var); + PADDLE_ENFORCE_NOT_NULL(new_var, platform::errors::InvalidArgument( + "The variable to insert is NULL.")); if (vars_.count(new_var) != 0) return; auto new_var_desc = new framework::VarDesc(""); @@ -215,7 +222,9 @@ void ProgramDescTracer::InsertVarIfNotExist( } const auto &inner_var = new_var->Var(); - PADDLE_ENFORCE_EQ(inner_var.IsInitialized(), true); + PADDLE_ENFORCE_EQ(inner_var.IsInitialized(), true, + platform::errors::InvalidArgument( + "The variable to insert is not initialized.")); if (inner_var.IsType()) { const auto &tensor = inner_var.Get(); new_var_desc->SetType(framework::proto::VarType::LOD_TENSOR); @@ -227,8 +236,9 @@ void ProgramDescTracer::InsertVarIfNotExist( new_var_desc->SetDataType(framework::proto::VarType::FP32); } } else { - PADDLE_THROW("Not support variable type %s", - framework::ToTypeName(inner_var.Type())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support variable type %s.", + framework::ToTypeName(inner_var.Type()))); } } diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.h b/paddle/fluid/imperative/jit/program_desc_tracer.h index d07acec22305e5a340a698ad85dbf15c183e9ccb..8e2e59a49ed7be473a1f89aaefde6bf123a9dea9 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.h +++ b/paddle/fluid/imperative/jit/program_desc_tracer.h @@ -21,12 +21,19 @@ #include #include #include + 
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/imperative/jit/op_desc_meta.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/macros.h" +namespace paddle { +namespace imperative { +class VarBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace imperative { namespace jit { diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 03e83301d44a35dc98e9a1aee0e1b22ef2380d50..d4df052a40d307c31978c33d9e14aef9ed4b3b82 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -24,6 +24,7 @@ #include #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/var_type.h" @@ -35,10 +36,18 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + namespace paddle { namespace imperative { class OpBase; +class GradOpNode; +class VariableWrapper; class ThreadSafeNameSet { public: diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 115078e7eadfc03153fc95ef05a9d4bb6cd40369..c8fd31fcbffe680da36d03276ec0d4c1095030bc 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/imperative/nccl_context.h" + #include "paddle/fluid/platform/collective_helper.h" namespace paddle { @@ -21,8 +22,10 @@ namespace imperative { void NCCLParallelContext::RecvNCCLID(const std::string &ep, ncclUniqueId *nccl_id) { auto addr = paddle::string::Split(ep, ':'); - PADDLE_ENFORCE_EQ(addr.size(), 2UL, - "The endpoint should contain host and port: %s", ep); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); std::string host = addr[0]; int port = std::stoi(addr[1]); @@ -32,27 +35,41 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep, char buffer[1024] = {0}; int opt = 0; // creating socket fd - if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) - PADDLE_THROW("create server fd failed"); - if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) - PADDLE_THROW("set socket opt failed"); + if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) { + PADDLE_THROW( + platform::errors::Unavailable("Create server file descriptor failed.")); + } + + if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) { + PADDLE_THROW(platform::errors::Unavailable("Set socket options failed.")); + } address.sin_family = AF_INET; address.sin_addr.s_addr = INADDR_ANY; address.sin_port = htons(port); - if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) - PADDLE_THROW("binding failed on ep: %s", ep); + if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) { + PADDLE_THROW( + platform::errors::Unavailable("Bind on endpoint %s failed.", ep)); + } + VLOG(3) << "listening on: " << ep; - if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed"); + if (listen(server_fd, 3) < 0) { + PADDLE_THROW(platform::errors::Unavailable( + "Listen on server file descriptor failed.")); + } if ((new_socket = accept(server_fd, reinterpret_cast(&address), - reinterpret_cast(&addrlen))) < 0) - PADDLE_THROW("accept the new socket 
fd failed"); + reinterpret_cast(&addrlen))) < 0) { + PADDLE_THROW(platform::errors::Unavailable( + "Accept the new socket file descriptor failed.")); + } + + if (read(new_socket, buffer, 1024) < 0) { + PADDLE_THROW(platform::errors::Unavailable("Read from socket failed.")); + } - if (read(new_socket, buffer, 1024) < 0) - PADDLE_THROW("reading the ncclUniqueId from socket failed"); VLOG(3) << "recevived the ncclUniqueId"; memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES); @@ -63,8 +80,10 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep, void NCCLParallelContext::SendNCCLID(const std::string &ep, ncclUniqueId *nccl_id) { auto addr = paddle::string::Split(ep, ':'); - PADDLE_ENFORCE_EQ(addr.size(), 2UL, - "The endpoint should contain host and port: %s", ep); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); std::string host = addr[0]; int port = std::stoi(addr[1]); // struct sockaddr_in address; @@ -73,15 +92,17 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep, char buffer[1024] = {0}; memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES); - if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) - PADDLE_THROW("create socket failed"); + if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + PADDLE_THROW(platform::errors::Unavailable("Create socket failed.")); + } memset(&serv_addr, '0', sizeof(serv_addr)); serv_addr.sin_family = AF_INET; serv_addr.sin_port = htons(port); - if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) - PADDLE_THROW("invalied address: %s", ep); + if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) { + PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep)); + } int try_times = 0; while (true) { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index bf02eebdbb6be6638592017b264f0ef5054537ff..3bf032e642bb72f00f4bc045f7ffccff0c7e0263 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -17,12 +17,23 @@ #include #include #include + #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/type_defs.h" +namespace paddle { +namespace framework { +class Tensor; +class Variable; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc index 34570b3a60ec83fdeb1577789271942125b16eb1..85063a6821680f6c990282f8492b3fe9f54bfbf5 100644 --- a/paddle/fluid/imperative/profiler.cc +++ b/paddle/fluid/imperative/profiler.cc @@ -19,8 +19,7 @@ #endif #include #include -#include // NOLINT -#include // NOLINT +#include // NOLINT DEFINE_string( tracer_profile_fname, "xxgperf", diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index 8ce257a6c37d7d4b0fa5bc3610ee7283fdcf0659..93ea988d638e4b67dde4707ca51f8ff1088d9059 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/fluid/imperative/nccl_context.h" + #include "gtest/gtest.h" -#include "paddle/fluid/platform/device_context.h" namespace imperative = paddle::imperative; namespace platform = paddle::platform; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 9dc96fdfe8622e3e78673664637ab50970fe93c6..f85e1f651165646095dae06032cdec962a37a2d1 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -44,13 +44,9 @@ add_subdirectory(api) set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) -if(WIN32) - cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_API}) -else() - create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) -endif() +create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) -if(NOT APPLE AND NOT WIN32) +if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}") diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index ebb19fd486cc89c69d70de3fa98954b9ee415f1a..ee1093104834d5758b7abc1a0a538aa9a84d64fe 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -18,7 +18,14 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" #include -#include + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h index 1257562972eec85e6503a47fbb7bb3be40d2b83c..621c631b8539beb9b35dcd33f3d14ef5e5745396 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -27,6 +27,12 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/node.h" +namespace paddle { +namespace framework { +class BlockDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index b6b67ce8eceb40cbf0aa98fe56684d76ce6c9602..bb0248008e0306d28a67ab134abb9ce39587f5c2 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -18,11 +18,21 @@ #include #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" #include "paddle/fluid/inference/api/paddle_analysis_config.h" +namespace paddle { +namespace framework { +namespace ir { +class Graph; +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h b/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h index 65d1c545313e110028a92776e73a070d32010420..66d5667a37ce8f9e43b01d204df5ea9014b54564 100644 --- a/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h +++ b/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h @@ -30,6 +30,8 @@ namespace analysis { * The default cudnn workspace is 4G, we set it to 64M in this pass, which * is applicable for most inference tasks. */ +struct Argument; + class AdjustCudnnWorkSpacePass : public AnalysisPass { public: void RunImpl(Argument *argument) override; diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h index 7fbdd88e014ba83277e45798b98ab90af3191f99..bbdf8b1009a55f84710778c575a1848798869734 100644 --- a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h +++ b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h @@ -32,6 +32,8 @@ namespace analysis { * So, We added the corresponding inference impl to these ops separately. * This pass replaces these ops with corresponding inference ops. 
*/ +struct Argument; + class InferenceOpReplacePass : public AnalysisPass { public: void RunImpl(Argument *argument) override; diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h index a9d58aa2f4cbb5d135221b0d02c633f6f78c8190..a4d60e91e8455c2c27fd9a146a30d03207ff3dc2 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h @@ -16,12 +16,15 @@ #include #include + #include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { namespace inference { namespace analysis { +struct Argument; + class IrInferCleanGraphPass : public AnalysisPass { public: void RunImpl(Argument *argument) override; diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h index 838ebdbc9d71eb3a73882e4c4c3e4bf6078150e4..613eb04497e61824e1b64b5dbfb4c80c81932227 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h @@ -14,12 +14,15 @@ #pragma once +#include #include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { namespace inference { namespace analysis { +struct Argument; + class IrGraphToProgramPass : public AnalysisPass { public: void RunImpl(Argument *argument) override; diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index f432188131eddc402e696091ab3723697216aadf..defa0a525f6885a6206f54e6a27190ba181b74a2 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -13,23 +13,24 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" + #include -#include #include #include -#include #include #include -#include #include -#include + #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/graph_to_program_pass.h" -#include "paddle/fluid/framework/ir/graph_traits.h" -#include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index 77da5d40d8dea96e6d7ae833501626894c6d7b37..6d20aee295b7c1d2fe533bf8cd5195cb105afe2a 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -18,9 +18,18 @@ #include #include #include + #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/platform/port.h" +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 2184574aa1fe3c66728b41f221c1b0bf5fd464e7..4abe293c930e23d4896adb3af25ad0532d95c12c 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" namespace paddle { +struct MkldnnQuantizerConfig; + extern const std::vector kTRTSubgraphPasses; extern const std::vector kLiteSubgraphPasses; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index ac914700643af2e7e8eca5dcf0bdf8de88e320d6..6c68b385bcbc04844309c581bf3afd848fe4d1aa 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -245,7 +245,18 @@ bool AnalysisPredictor::PrepareExecutor() { void AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { #ifdef PADDLE_WITH_MKLDNN - VLOG(2) << "AnalysisPredictor::Run get_cur_mkldnn_session_id=" + std::vector> inputs_shape; + for (size_t i = 0; i < inputs.size(); ++i) { + inputs_shape.emplace_back(inputs[i].shape); + } + MkldnnPreSet(inputs_shape); +#endif +} + +void AnalysisPredictor::MkldnnPreSet( + const std::vector> &inputs_shape) { +#ifdef PADDLE_WITH_MKLDNN + VLOG(2) << "AnalysisPredictor::ZeroCopyRun get_cur_mkldnn_session_id=" << platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id(); // In cache clearing mode. if (config_.mkldnn_cache_capacity_ > 0) { @@ -257,9 +268,9 @@ void AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { config_.mkldnn_cache_capacity_); // Set current_input_shape for caching dynamic shape. 
std::stringstream ss; - for (size_t i = 0; i < inputs.size(); ++i) { - for (size_t j = 0; j < inputs[i].shape.size(); ++j) { - ss << inputs[i].shape[j] << "-"; + for (size_t i = 0; i < inputs_shape.size(); ++i) { + for (size_t j = 0; j < inputs_shape[i].size(); ++j) { + ss << inputs_shape[i][j] << "-"; } } VLOG(2) << "Set input shape=" << ss.str(); @@ -742,6 +753,18 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( bool AnalysisPredictor::ZeroCopyRun() { paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); +#ifdef PADDLE_WITH_MKLDNN + if (config_.use_mkldnn_) { + std::vector> shape_vector; + auto names = GetInputNames(); + for (size_t i = 0; i < names.size(); ++i) { + auto in_tensor = GetInputTensor(names[i]); + shape_vector.emplace_back(in_tensor->shape()); + } + MkldnnPreSet(shape_vector); + } +#endif + executor_->Run(); // Fix TensorArray reuse not cleaned bug. tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_); @@ -750,6 +773,9 @@ bool AnalysisPredictor::ZeroCopyRun() { // recover the cpu_math_library_num_threads to 1, in order to avoid thread // conflict when integrating it into deployment service. paddle::platform::SetNumThreads(1); +#ifdef PADDLE_WITH_MKLDNN + if (config_.use_mkldnn_) MkldnnPostReset(); +#endif #if defined(PADDLE_WITH_MKLML) // Frees unused memory allocated by the Intel® MKL Memory Allocator to // avoid memory leak. See: @@ -1048,6 +1074,7 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) { template <> std::unique_ptr CreatePaddlePredictor( const AnalysisConfig &config) { + LOG(WARNING) << "Deprecated. Please use CreatePredictor instead."; return CreatePaddlePredictor( config); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 365f86c21105a7f1ffb7c300e0ab38c6aaa230fc..c4a7173b0104b767193e7d7eee3b10f272d396a2 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -317,6 +317,17 @@ class AnalysisPredictor : public PaddlePredictor { /// \param[in] inputs tensors /// void MkldnnPreSet(const std::vector &inputs); + + /// + /// \brief PreSet for Mkldnn multi-thread and dynamic shape input. + /// + /// Used in AnalysisPredictor::Run(), do not support + /// AnalysisPredictor::ZeroCopyRun() now. + /// + /// \param[in] inputs tensor shape + /// + void MkldnnPreSet(const std::vector> &inputs_shape); + /// /// \brief PostReset for Mkldnn multi-thread and dynamic shape input. /// diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index ca0a5148f0622a8c848cb18afb94f600a547bbfe..c78cdf24dec561f5fd5643cb50ee243a58b3ab6a 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -373,6 +373,7 @@ std::unique_ptr CreatePaddlePredictor< template <> std::unique_ptr CreatePaddlePredictor( const NativeConfig &config) { + LOG(WARNING) << "Deprecated. Please use CreatePredictor instead."; return CreatePaddlePredictor(config); } diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 96b94777304382a9d4be115a84f80ead69249863..be771ac48fc155c4e53dfcd46e8dd707ad9593bd 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -19,18 +19,26 @@ limitations under the License. 
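Taken together, the AnalysisPredictor changes above make the MKLDNN shape-cache bookkeeping reachable from ZeroCopyRun(): when use_mkldnn_ is set, the predictor now gathers the shapes of all bound input tensors, passes them to the new MkldnnPreSet(const std::vector<std::vector<int>> &) overload before execution, and calls MkldnnPostReset() afterwards. A rough caller-side sketch of the path this exercises, assuming the standard AnalysisConfig / ZeroCopyTensor API of this codebase — the model path, input shape and cache capacity are placeholder values, not taken from the patch:

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    paddle::AnalysisConfig config;
    config.SetModel("./mobilenet_v1");       // placeholder model directory
    config.DisableGpu();
    config.EnableMKLDNN();
    config.SetMkldnnCacheCapacity(10);       // > 0 enables the cache-clearing mode
    config.SwitchUseFeedFetchOps(false);     // required for the zero-copy API

    // Note: with this patch, CreatePaddlePredictor also logs the new
    // "Deprecated. Please use CreatePredictor instead." warning.
    auto predictor = paddle::CreatePaddlePredictor(config);

    auto names = predictor->GetInputNames();
    auto input = predictor->GetInputTensor(names[0]);
    std::vector<float> data(1 * 3 * 224 * 224, 0.f);  // placeholder input
    input->Reshape({1, 3, 224, 224});
    input->copy_from_cpu(data.data());

    // ZeroCopyRun() now records the input shapes through MkldnnPreSet()
    // and resets the MKLDNN cache state via MkldnnPostReset() when done.
    predictor->ZeroCopyRun();
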
*/ #include #include #include + #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { +namespace framework { +class LoDTensor; +class Scope; +} // namespace framework + class NativePaddlePredictor : public PaddlePredictor { public: explicit NativePaddlePredictor(const NativeConfig &config) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 08a1a5428193c2d506f511112e4a26d73c382ff1..6a3760e1f749b2b4875df00b01def57c979b3c93 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -51,8 +51,8 @@ if (WIN32) set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + safe_set_static_flag() if (WITH_STATIC_LIB) - safe_set_static_flag() add_definitions(-DSTATIC_LIB) endif() endif() @@ -136,7 +136,7 @@ else() set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB}) - set(DEPS ${DEPS} libcmt shlwapi.lib) + set(DEPS ${DEPS} shlwapi.lib) endif(NOT WIN32) if(WITH_GPU) diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 6b7fb0f619a67cc01dac2b09525bb2bfa05207ba..6d283ca56cb652515b5ade923ea85e38142bf08c 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -6,7 +6,7 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib - +MSVC_STATIC_CRT=$7 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir cd `dirname $0` @@ -66,43 +66,49 @@ mkdir -p build cd build rm -rf * -if [ $(echo `uname` | grep "Win") != "" ]; then - # -----simple_on_word2vec on windows----- - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ - -DDEMO_NAME=simple_on_word2vec \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=OFF - msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln - Release/simple_on_word2vec.exe \ - --dirname=$DATA_DIR/word2vec/word2vec.inference.model \ - --use_gpu=False - if [ $? -ne 0 ]; then - echo "simple_on_word2vec demo runs fail." - exit 1 - fi +for WITH_STATIC_LIB in ON OFF; do + if [ $(echo `uname` | grep "Win") != "" ]; then + # -----simple_on_word2vec on windows----- + cmake .. 
-G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=simple_on_word2vec \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln + for use_gpu in $use_gpu_list; do + Release/simple_on_word2vec.exe \ + --dirname=$DATA_DIR/word2vec/word2vec.inference.model \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then + echo "simple_on_word2vec demo runs fail." + exit 1 + fi + done - # -----vis_demo on windows----- - rm -rf * - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ - -DDEMO_NAME=vis_demo \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=OFF - msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln - for vis_demo_name in $vis_demo_list; do - Release/vis_demo.exe \ - --modeldir=$DATA_DIR/$vis_demo_name/model \ - --data=$DATA_DIR/$vis_demo_name/data.txt \ - --refer=$DATA_DIR/$vis_demo_name/result.txt \ - --use_gpu=False - if [ $? -ne 0 ]; then - echo "vis demo $vis_demo_name runs fail." - exit 1 - fi - done -else - for WITH_STATIC_LIB in ON OFF; do + # -----vis_demo on windows----- + rm -rf * + cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=vis_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln + for use_gpu in $use_gpu_list; do + for vis_demo_name in $vis_demo_list; do + Release/vis_demo.exe \ + --modeldir=$DATA_DIR/$vis_demo_name/model \ + --data=$DATA_DIR/$vis_demo_name/data.txt \ + --refer=$DATA_DIR/$vis_demo_name/result.txt \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then + echo "vis demo $vis_demo_name runs fail." + exit 1 + fi + done + done + else # -----simple_on_word2vec on linux/mac----- rm -rf * cmake .. -DPADDLE_LIB=${inference_install_dir} \ @@ -123,7 +129,6 @@ else fi done fi - # ---------vis_demo on linux/mac--------- rm -rf * cmake .. 
-DPADDLE_LIB=${inference_install_dir} \ @@ -145,7 +150,6 @@ else fi done done - # --------tensorrt mobilenet on linux/mac------ if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then rm -rf * @@ -167,6 +171,6 @@ else exit 1 fi fi - done -fi + fi +done set +x diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc index 03c2aa3fb8094ce2996f513b90589de0ef903ae8..ee5c10b7bf6753b3b6983ce9c96c5e1205a0a1a3 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.cc +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -14,6 +14,12 @@ #include "paddle/fluid/inference/api/details/reset_tensor_array.h" +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + namespace paddle { namespace details { diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index 213c6891d0e2320689c8c69266d40611f295edc8..be5fe1d64f9e7fbba207878c9ca6740109a41b7c 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -16,10 +16,18 @@ #include #include + #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" +namespace paddle { +namespace framework { +class LoDTensor; +class Scope; +} // namespace framework +} // namespace paddle + namespace paddle { namespace details { diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 014985661fd927debb48c699a157c0e05265842c..061b83e1d1e4a5180b281c672cf81f6e6a0c51bd 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -27,6 +27,7 @@ #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index 76cf1661f305443592c345e879f2d4514dc914d9..30c6c21ec87f36cdfe0f4ef7950236763c13191a 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include + #include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h" namespace paddle { diff --git a/paddle/fluid/inference/api/paddle_infer_declare.h b/paddle/fluid/inference/api/paddle_infer_declare.h index 39c9653f16cefb71a9f2a0ddcc08723d189d411c..e8525f440fe7f2d54d045eedb79aed228513e550 100644 --- a/paddle/fluid/inference/api/paddle_infer_declare.h +++ b/paddle/fluid/inference/api/paddle_infer_declare.h @@ -17,11 +17,7 @@ #if defined(_WIN32) #ifndef PD_INFER_DECL #ifdef PADDLE_DLL_INFERENCE -#ifndef PADDLE_ON_INFERENCE -#define PD_INFER_DECL -#else #define PD_INFER_DECL __declspec(dllexport) -#endif // PADDLE_ON_INFERENCE #else #define PD_INFER_DECL __declspec(dllimport) #endif // PADDLE_DLL_INFERENCE diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 9073253520466a3711089bc7b7da04a9191e0a42..b10c290b226a7da8c2b55097dae5d6ffe34fc180 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle_infer_declare.h" // NOLINT /// diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index 31915496893e6242dc7cd10ffd48af278d124245..c1bf4c974fac8c80c3e8e31fbd247332a325e2aa 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ b/paddle/fluid/inference/capi/pd_predictor.cc @@ -131,7 +131,9 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, PADDLE_ENFORCE_EQ( input_names.size(), in_size, paddle::platform::errors::InvalidArgument( - "The number of input and the number of model's input must match.")); + "The number of input and the number of model's input must match. The " + "number of input is %d, the number of model's input is %d.", + input_names.size(), in_size)); for (int i = 0; i < in_size; ++i) { auto input_t = predictor->GetInputTensor(inputs[i].name); std::vector tensor_shape; diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index c497ab384b5fac74b5241d61517485fd8f2b40c4..84e011c6505a8fe974effbecf54101e0e51d29fa 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -47,7 +47,9 @@ void Init(const std::vector argv) { void ReadBinaryFile(const std::string& filename, std::string* contents) { std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + PADDLE_ENFORCE_EQ( + fin.is_open(), true, + platform::errors::Unavailable("Failed to open file %s.", filename)); fin.seekg(0, std::ios::end); contents->clear(); contents->resize(fin.tellg()); @@ -133,9 +135,10 @@ std::unique_ptr Load(framework::Executor* executor, std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); - PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), - "model version %ld is not supported.", - main_program->Version()); + PADDLE_ENFORCE_EQ( + framework::IsProgramVersionSupported(main_program->Version()), true, + platform::errors::Unavailable("Model version %ld is not supported.", + main_program->Version())); // model_from_memory is false in separate parameters. 
LoadPersistables(executor, scope, *main_program, dirname, "", @@ -151,9 +154,10 @@ std::unique_ptr Load( std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); - PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), - "model version %ld is not supported.", - main_program->Version()); + PADDLE_ENFORCE_EQ( + framework::IsProgramVersionSupported(main_program->Version()), true, + platform::errors::Unavailable("Model version %ld is not supported.", + main_program->Version())); LoadPersistables(executor, scope, *main_program, "", param_filename, false /* model_from_memory */); @@ -165,9 +169,10 @@ std::unique_ptr LoadFromMemory( const std::string& prog_buffer, const std::string& param_buffer) { std::unique_ptr main_program( new framework::ProgramDesc(prog_buffer)); - PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), - "model version %ld is not supported.", - main_program->Version()); + PADDLE_ENFORCE_EQ( + framework::IsProgramVersionSupported(main_program->Version()), true, + platform::errors::Unavailable("Model version %ld is not supported.", + main_program->Version())); LoadPersistables(executor, scope, *main_program, "", param_buffer, true /* model_filename */); diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 1a428e205cb0773697855c382d917778c41be205..57aeee99ba2bf80eb21773b24b47c23f1911ed53 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -12,8 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/op_registry.h" +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace nvinfer1 { +class IActivationLayer; +class ITensor; +} // namespace nvinfer1 +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index f67370b33828b77fec166017331c550698eaa658..2f4f9320607e3d77b7b2aaa31dca7e64c76d73a7 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -12,9 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace nvinfer1 { +class IScaleLayer; +} // namespace nvinfer1 +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index 5d63aa2ace86cb89917126f3a6fef9d0e9839e8c..5ecf19233887754c1a38c9c74c4b75d345609f67 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -14,6 +14,15 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index aa03bc44bd629513d96cda541c0b7162629bfdc8..f582d7e07055bd2083bf375b89ac98bee4990711 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -14,6 +14,15 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 0541d8917057c3a96ea3b2ffbb0bec283e00f683..d11dbc16e87d25e86abcf5b8425cf7036550b6e4 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -14,6 +14,15 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 9fff558c583596215c191a31e95b4e9b2aad058b..b846b3033f6742a20cc42529db8f9eb302b7a71f 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -13,6 +13,15 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 0dccd3cc6390af736aa4d205a12577fd9ee14f11..cd16ed7396532cb987263bc8a9c510e516dc7bc1 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -14,6 +14,15 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index 7927b6cd1bb559b1c4ce81d2846942a0db31e5fc..aad822b33546e68d67912ac9f9f2581cd2f8e5e8 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -15,6 +15,18 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h" +namespace nvinfer1 { +class ILayer; +} // namespace nvinfer1 +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/hard_sigmoid_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_sigmoid_op.cc index 3b6e464a117d3eedb4848dcaaf5c06fdb453fac1..f2c1bafb4ae781ea340cc84f4465733948c35ded 100644 --- a/paddle/fluid/inference/tensorrt/convert/hard_sigmoid_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/hard_sigmoid_op.cc @@ -14,6 +14,15 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc index 39820068ca848c0fc758290b031867447b113795..967f79a1643a59df38721b8f7d9158e56391f2d4 100644 --- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc @@ -15,6 +15,18 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h" +namespace nvinfer1 { +class ILayer; +} // namespace nvinfer1 +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc index 8a0b50f4744c456647688c6d6b0533518cec0bf4..d746c51c5c5706eea14f371eaa74f0e76adfed34 100644 --- a/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc @@ -15,6 +15,18 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h" +namespace nvinfer1 { +class IPluginLayer; +} // namespace nvinfer1 +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index 29826909c3ce34e8ac1d780352e412a776feb1d6..e348de9877f463e22b317c658d60f5723942afd1 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -14,6 +14,18 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace nvinfer1 { +class ILayer; +} // namespace nvinfer1 +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index 5b6aaad49833cedbd8d1ee0ec5d24c7f983190e6..c99528b207b6c02675b213977fb5f161cef21be5 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -14,6 +14,15 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index dd594404d3316ada6e20624c074368f241ca5cdd..7ddedf969fd1c16b20ef42e97835f04eb819dcc0 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -14,6 +14,15 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 864e440920e3b49feee7c7e9a839a15f3a961f9c..c031630f36d789bb205c23ae7a899089761f5391 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -15,6 +15,15 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc index f9a1fe41ddc046aad8cc3a5397453b0f68c1a112..18c97890d72a51de412421b6ae48f382728ae8fc 100644 --- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc @@ -14,6 +14,15 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc index 326915dfff528db8a373e04dc4dde26e37766fee..7090e298ddc3de453fe9177a841ca9d605f15094 100644 --- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc @@ -14,6 +14,15 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 9f4a048961f8d285f9280161e3ea3e3630b63556..0388154427e3a63d03d3c07aaadbc360454a6001 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -14,6 +14,15 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index e220d80f0d79da5eab98aa7a18a5093f9f4a55c4..ab82a6578fb4dd367628808d894e54f6741a73d1 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -15,6 +15,18 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h" +namespace nvinfer1 { +class ILayer; +} // namespace nvinfer1 +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 754979f77acd7a3b4818cdf16ef9c525bf1d82ea..1f7ea7ea04404712a29ef486004601f63510ea74 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -15,10 +15,9 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/engine.h" #include -#include #include #include -#include "paddle/fluid/inference/analysis/helper.h" + #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/platform/enforce.h" @@ -26,6 +25,10 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { +class PluginTensorRT; +} // namespace plugin + int TensorRTEngine::runtime_batch_ = 1; void TensorRTEngine::InitNetwork() { diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index a85ed483c1d12c3f2eecc5ed4bcb99937397a765..71625210054b36938f1da942066e5e346ef2beb5 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -23,6 +23,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/api/paddle_analysis_config.h" @@ -33,10 +34,20 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { +class PluginTensorRT; +} // namespace plugin + using FluidDT = framework::proto::VarType_Type; using TRT_DT = nvinfer1::DataType; @@ -94,6 +105,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, std::string input, } // NOLINT class TRTInt8Calibrator; + /* * TensorRT Engine. 
* diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 31128ba8c5d42acac0dff321adbc40dbb0ce0c19..23aacedd693b4d7ea0622374d83c199d964a25e4 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -14,6 +14,12 @@ #include "paddle/fluid/inference/tensorrt/op_teller.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h index 76784c7445e841952ea34a763bbdd5a679a366cc..9113525a5c94fda633f08188687e822822bb7bce 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.h +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -17,9 +17,16 @@ #include #include #include + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/inference/tensorrt/engine.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc index 76b0832c546b92068364ba6b2eda65a04742e5f0..0bf8a1691e2192b278fcd209162135027ed24e71 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc @@ -27,8 +27,8 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name, PADDLE_ENFORCE_EQ( Has(plugin_type), true, - platform::errors::NotFound( - "trt plugin type %s does not exists, check it.", plugin_type)); + platform::errors::NotFound("TensorRT plugin type `%s` does not exists.", + plugin_type)); auto plugin = plugin_registry_[plugin_type](serial_data, serial_length); owned_plugins_.emplace_back(plugin); diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h index ed825801fc4b64e3c220a0d357dc7e5c5bde9c90..076dfbcf8f095ff15a265239c7b267db952b14be 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h @@ -26,6 +26,7 @@ #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/variant.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h index 6fcb70c6d3299f830e1e95e328b2645aedf9cc31..16751c764bd03af9bbb7cbd77dd9287c17150dd5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h @@ -103,12 +103,11 @@ struct Serializer, DeserializeValue(buffer, buffer_size, &size); value->resize(size); size_t nbyte = value->size() * sizeof(T); - PADDLE_ENFORCE_GE( - *buffer_size, nbyte, - platform::errors::InvalidArgument("Expect buffer size >= value size in " - "trt plugin deserialization, but got " - "buffer size = %d, value size = %d.", - *buffer_size, nbyte)); + PADDLE_ENFORCE_GE(*buffer_size, nbyte, + platform::errors::InvalidArgument( + "Insufficient data in buffer, expect contains %d " + "byte, but actually only contains %d byte.", + *buffer_size, nbyte)); 
std::memcpy(value->data(), *buffer, nbyte); reinterpret_cast(*buffer) += nbyte; *buffer_size -= nbyte; diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 72962c733ecf6a7bc6871fd3a5c65d6156b084d4..7c763858bb2101543af1dce0f3b81e964257a696 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -12,13 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include #include #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h index 5815bc9a1464293e0a56f05e34183580eac96cea..b4b7ee50dc3511cef1ef2e5dbef9251f7ea7fd67 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -23,6 +23,7 @@ #include #include #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 146d5932577fb7f4e2e33f6d98c51756ffd02073..5d6970fc4e3856a1945dfcc407b2d16b5032d3df 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -132,9 +132,17 @@ if(NOT APPLE AND WITH_MKLML) # seq_pool1 set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") - inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) + inference_analysis_api_test(test_analyzer_seq_pool1_compare_determine ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_determine_tester.cc) + inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_tester.cc) + inference_analysis_api_test(test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc) + inference_analysis_api_test(test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_statis_tester.cc) + inference_analysis_api_test(test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_profile_tester.cc) if(NOT WIN32) - set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150) + set_tests_properties(test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_seq_pool1_fuse_statis PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_seq_pool1_profile PROPERTIES TIMEOUT 120) endif() else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that @@ -198,6 +206,9 @@ inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc if(NOT WIN32 AND NOT APPLE) set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY") endif() +if (WIN32) + 
set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 200) +endif() # text_classification set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") @@ -212,7 +223,15 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana # transformer, the dataset only works on batch_size=8 now set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer") download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz") -inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc +inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_compare_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}) +inference_analysis_test(test_analyzer_transformer_fuse SRCS analyzer_transformer_fuse_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}) +inference_analysis_test(test_analyzer_transformer_profile SRCS analyzer_transformer_profile_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}) @@ -258,6 +277,10 @@ set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") download_data(${RESNET50_MODEL_DIR} "resnet50_model.tar.gz") inference_analysis_api_test_with_fake_data_run(test_analyzer_resnet50 ${IMG_CLASS_TEST_APP} ${RESNET50_MODEL_DIR} true) +if (WIN32) + set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 200) +endif() + # mobilenet with depthwise_conv op set(MOBILENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv") diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f0778b83e52e93b1b30c06cf2bd950bdb62f3b8 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { +namespace seq_pool1_tester { + +// Compare Deterministic result +TEST(Analyzer_seq_pool1_compare_determine, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +} // namespace seq_pool1_tester +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..099ff1f31a759a694ff81aa98b961f935f0d2109 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { +namespace seq_pool1_tester { + +TEST(Analyzer_seq_pool1_compare, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +} // namespace seq_pool1_tester +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..1fbcbf1a3f4275ae7973ca33f85d886c40bb1da4 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { +namespace seq_pool1_tester { + +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_seq_pool1_compare_zero_copy, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); + + AnalysisConfig cfg1; + SetConfig(&cfg1); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + std::vector outputs_name; + outputs_name.emplace_back(out_var_name); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + reinterpret_cast(&cfg1), + input_slots_all, outputs_name); +} + +} // namespace seq_pool1_tester +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..b8ccb8cee507b97dc50d0dcfa3677bc8ebb51b6c --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { +namespace seq_pool1_tester { + +// Check the fuse status +TEST(Analyzer_seq_pool1_fuse_statis, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse")); + ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse")); + ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse")); + ASSERT_EQ(fuse_statis.at("fc_fuse"), 10); + EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2); + EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2); + EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2); + LOG(INFO) << "num_ops: " << num_ops; + EXPECT_EQ(num_ops, 171); +} + +} // namespace seq_pool1_tester +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ccd95f2a176de5b822eba98be7f53d779927e9f --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { +namespace seq_pool1_tester { + +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg, use_mkldnn); + + std::vector> outputs; + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_seq_pool1_profile, profile) { profile(); } + +} // namespace seq_pool1_tester +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h similarity index 70% rename from paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc rename to paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h index 9f1556cdb871aa3e5bbe613aa98299c162661c42..0dac11bc3452d3e3e88d86a76d439dd5b489c9c0 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h @@ -11,15 +11,20 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - +#pragma once #include #include #include +#include +#include +#include +#include #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { namespace analysis { +namespace seq_pool1_tester { // diff: similarity_norm.tmp_0, for speed: fc_4.tmp_1 static const char out_var_name[] = "reduce_sum_0.tmp_0"; @@ -164,77 +169,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { cfg->pass_builder()->InsertPass(2, "seqpool_concat_fuse_pass"); } -void profile(bool use_mkldnn = false) { - AnalysisConfig cfg; - SetConfig(&cfg, use_mkldnn); - - std::vector> outputs; - std::vector> input_slots_all; - SetInput(&input_slots_all); - TestPrediction(reinterpret_cast(&cfg), - input_slots_all, &outputs, FLAGS_num_threads); -} - -TEST(Analyzer_seq_pool1, profile) { profile(); } - -// Compare result of NativeConfig and AnalysisConfig -TEST(Analyzer_seq_pool1, compare) { - AnalysisConfig cfg; - SetConfig(&cfg); - - std::vector> input_slots_all; - SetInput(&input_slots_all); - CompareNativeAndAnalysis( - reinterpret_cast(&cfg), input_slots_all); -} - -// Compare Deterministic result -TEST(Analyzer_seq_pool1, compare_determine) { - AnalysisConfig cfg; - SetConfig(&cfg); - - std::vector> input_slots_all; - SetInput(&input_slots_all); - CompareDeterministic(reinterpret_cast(&cfg), - input_slots_all); -} - -// Check the fuse status -TEST(Analyzer_seq_pool1, fuse_statis) { - AnalysisConfig cfg; - SetConfig(&cfg); - int num_ops; - auto predictor = CreatePaddlePredictor(cfg); - auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); - ASSERT_TRUE(fuse_statis.count("fc_fuse")); - ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse")); - ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse")); - ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse")); - ASSERT_EQ(fuse_statis.at("fc_fuse"), 10); - EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2); - EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2); - EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2); - LOG(INFO) << "num_ops: " << num_ops; - EXPECT_EQ(num_ops, 171); -} - -// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy -TEST(Analyzer_seq_pool1, compare_zero_copy) { - AnalysisConfig cfg; - SetConfig(&cfg); - - AnalysisConfig cfg1; - SetConfig(&cfg1); - - std::vector> input_slots_all; - SetInput(&input_slots_all); - std::vector outputs_name; - outputs_name.emplace_back(out_var_name); - CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), - reinterpret_cast(&cfg1), - input_slots_all, outputs_name); -} - +} // namespace seq_pool1_tester } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..f26ec57103b76500eab99ef11eadc694e2c9b192 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { +namespace transformer_tester { + +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +TEST(Analyzer_Transformer, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + +} // namespace transformer_tester +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e5484c9ea01df81c9982743feefcf4a71f421a1 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { +namespace transformer_tester { + +// Check the fuse status +TEST(Analyzer_Transformer, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); +} + +} // namespace transformer_tester +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..caeba3277163b2a15183972fb07d315bd951ccde --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
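// Illustrative sketch, not part of the patch: the monolithic analyzer_*_tester.cc
// files are being split so that each check (profile, compare, fuse_statis,
// compare_zero_copy) builds as its own test binary, while SetConfig()/SetInput()
// move into a shared *_tester_helper.h. Any further check would follow the same
// shape; the file name and test below are hypothetical and not added by this patch.
//
// analyzer_transformer_compare_determine_tester.cc (hypothetical)
#include <vector>
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"

namespace paddle {
namespace inference {
namespace analysis {
namespace transformer_tester {

// Reuses the shared SetConfig/SetInput helpers and only adds the check itself.
TEST(Analyzer_Transformer, compare_determine) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                       input_slots_all);
}

}  // namespace transformer_tester
}  // namespace analysis
}  // namespace inference
}  // namespace paddle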
+ +#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { +namespace transformer_tester { + +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector> outputs; + if (use_mkldnn) { + cfg.EnableMKLDNN(); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_Transformer, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); } +#endif + +} // namespace transformer_tester +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h similarity index 82% rename from paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc rename to paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h index 9726109bf89ac0d5e1048f6cae0483248696f3e2..e43456ed8322e759e1e7f56c11621d696b8efb82 100644 --- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h @@ -11,11 +11,16 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - +#pragma once +#include +#include +#include #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { +namespace analysis { +namespace transformer_tester { struct DataRecord { std::vector> src_word, src_pos, trg_word, init_idx; @@ -182,57 +187,7 @@ void SetInput(std::vector> *inputs) { } } -// Easy for profiling independently. 
-void profile(bool use_mkldnn = false) { - AnalysisConfig cfg; - SetConfig(&cfg); - std::vector> outputs; - if (use_mkldnn) { - cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); - } - - std::vector> input_slots_all; - SetInput(&input_slots_all); - TestPrediction(reinterpret_cast(&cfg), - input_slots_all, &outputs, FLAGS_num_threads); -} - -TEST(Analyzer_Transformer, profile) { profile(); } -#ifdef PADDLE_WITH_MKLDNN -TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); } -#endif - -// Check the fuse status -TEST(Analyzer_Transformer, fuse_statis) { - AnalysisConfig cfg; - SetConfig(&cfg); - - int num_ops; - auto predictor = CreatePaddlePredictor(cfg); - auto fuse_statis = GetFuseStatis( - static_cast(predictor.get()), &num_ops); -} - -// Compare result of NativeConfig and AnalysisConfig -void compare(bool use_mkldnn = false) { - AnalysisConfig cfg; - SetConfig(&cfg); - if (use_mkldnn) { - cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); - } - - std::vector> input_slots_all; - SetInput(&input_slots_all); - CompareNativeAndAnalysis( - reinterpret_cast(&cfg), input_slots_all); -} - -TEST(Analyzer_Transformer, compare) { compare(); } -#ifdef PADDLE_WITH_MKLDNN -TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); } -#endif - +} // namespace transformer_tester +} // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc index 074a397e323b5bfa0d09cd6684d093556b1093f5..8c76a03d29861defecc3b41130edebef6b767746 100644 --- a/paddle/fluid/inference/utils/benchmark.cc +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/utils/benchmark.h" -#include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/inference/utils/io_utils.h b/paddle/fluid/inference/utils/io_utils.h index 853aba168b524eb289d4f57645bcaee809acdd0b..de2c7b26d3382daba0a549fa71b73c2bbc6c6631 100644 --- a/paddle/fluid/inference/utils/io_utils.h +++ b/paddle/fluid/inference/utils/io_utils.h @@ -16,9 +16,14 @@ #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/api/paddle_api.h" +namespace paddle { +struct PaddleTensor; +} // namespace paddle + namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h index 990bef359499834c3a7cb025c3fb1d94ceea958e..6828924c300fdfec6640e7b19a2c06b0826aa455 100644 --- a/paddle/fluid/inference/utils/singleton.h +++ b/paddle/fluid/inference/utils/singleton.h @@ -46,7 +46,9 @@ struct Registry { template void Register(const std::string& name) { - PADDLE_ENFORCE_EQ(items_.count(name), 0); + PADDLE_ENFORCE_EQ(items_.count(name), 0, + platform::errors::AlreadyExists( + "Item `%s` has beed registered.", name)); items_[name] = new ItemChild; } diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h index 71250766034753e17687c9529d9fe1c314b6b662..6fef5cae8d6af32a8b5b2eff0c74ab94410bd070 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.h +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/memory/allocation/allocator.h" namespace paddle { diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 
379c8d00960947880e048c6990e7ba856ddc68dd..e54748a53679d1363246896a0982275c4ef09535 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/inlined_vector.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc index 3e10be037bd317f3bca8b2d8519a40df97303bc7..0029991116200a64db84c57afcb0edb17065a098 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/allocation/allocator_facade.h" -#include #include +#include "paddle/fluid/memory/allocation/allocator_facade.h" + #ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_cuda_pinned_memory_to_use); diff --git a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc index 3748805b1ceb9e392a767e8c08577c1f47bfe436..1e793d1617af38c3e69d5c5690b40524b031b3c7 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/allocation/allocator_facade.h" -#include #include +#include "paddle/fluid/memory/allocation/allocator_facade.h" + #ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_cuda_pinned_memory_to_use); diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc index e1c7b243becaa16531912c64c9f3d95e76cf5092..518b31e943048d51584dfa894fbf0aed96ee35f9 100644 --- a/paddle/fluid/memory/allocation/allocator_strategy.cc +++ b/paddle/fluid/memory/allocation/allocator_strategy.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/allocator_strategy.h" + #include "gflags/gflags.h" -#include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" DECLARE_string(allocator_strategy); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index 27257883d558e7b3a09f8aeb5264d093e7d5c480..cbc126264ac2c09ef2532bf21834a648c02473ec 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -19,6 +19,7 @@ #include #include // NOLINT #include + #include "paddle/fluid/memory/allocation/allocator.h" namespace paddle { diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 8865bf0b0db27f6c23ecf486daa076126223daf6..685248a88f71d695095bd844dea06558e5cbcee6 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" + #include #include + #include "gtest/gtest.h" DECLARE_bool(free_idle_chunk); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 957dac4d03e33b7e40d7781a033a77da3a814bdc..2b8d2164f68ad0b9ff6d000f8ddee74ddce6434d 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -13,11 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/best_fit_allocator.h" - -#include -#include -#include -#include +#include namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 64a552e4fd2af1f661e3174e5041ffc71f74fa2c..a6015417b1227b00a77373d4cc108253e44c10e1 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -16,8 +16,15 @@ #include #include #include + #include "paddle/fluid/memory/allocation/allocator.h" +namespace paddle { +namespace platform { +class Place; +} // namespace platform +} // namespace paddle + namespace paddle { namespace memory { namespace allocation { diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc index 7eed81a712ad3505b5c0833370df8b603c7d3953..d463ad1f5ebb1f5e31008a716f40b99b55895496 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -13,9 +13,6 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/buffered_allocator.h" - -#include -#include #include namespace paddle { diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h index fd0996f7748ef407262dba7bca705af9b5fb9674..5e1733bd839de2c0e219a40c569e492b1aca1d56 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.h +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/lock_guard_ptr.h" diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index b096fbc112c55a482e522ed27734232f18c73afa..407f0f25935bf1dde5159aab8c4e9954f6fb6ccb 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -16,8 +16,6 @@ #include -#include - namespace paddle { namespace memory { namespace allocation { diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h index 4af77e6e057f54d15dcb0248ba6cf36f6f00c2f1..1b8418bc8494a270645935f13dd3c8ea62849ed2 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.h +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -15,6 +15,7 @@ #include #include // NOLINT #include // NOLINT + #include "paddle/fluid/memory/allocation/allocator.h" namespace paddle { diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index 0ef084bafd0c9f9edb9c290653fe5448771f3df2..77e8d9943d00f7badd24cdb8603d16094ba9e84b 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -19,13 +19,8 @@ #include #include #include -#include -#include -#include - 
#include #include -#include namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/mmap_allocator_test.cc b/paddle/fluid/memory/allocation/mmap_allocator_test.cc index 5b66920be2a0dc988794cdd1156dc64fb201aa5d..bcb02e0479290c92ab064ae1a995758c1d079161 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator_test.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator_test.cc @@ -16,8 +16,6 @@ #include "paddle/fluid/memory/allocation/mmap_allocator.h" -#include - #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 913d583099c3f403a8262ff716fcd4c9ab930d22..4cf1bd6123e5fb3b99c60cc0a2750ef6295ab870 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -18,8 +18,10 @@ #include #include #include + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" + namespace paddle { namespace memory { namespace allocation { diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 3ea2ecf3538892589dc2636f00c91619640eb798..42dd50af7293dc7f52add81eee7b6a98071bb072 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -14,9 +14,6 @@ #include "paddle/fluid/memory/allocation/pinned_allocator.h" -#include -#include - namespace paddle { namespace memory { namespace allocation { diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index 0e81f5f2238f755de27750b405e771146b3cbf7d..5d3e133f97d38b80ebcdcfa92dec7368463efb55 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/retry_allocator.h" + #include #include // NOLINT #include // NOLINT @@ -20,6 +21,7 @@ #include #include // NOLINT #include + #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" @@ -45,7 +47,7 @@ TEST(RetryAllocator, RetryAllocator) { size_t thread_num = 4; size_t sleep_time = 40; - size_t extra_time = 10; + size_t extra_time = 20; // Reserve to perform more tests in the future std::vector> allocators; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index bc07ad0c4dce3fccb3570d35b31e111b7253cd58..10ca4b828a4bb508ed91d15f2649c3d0d5e1da9a 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index faa87f1f01a8bb0188f41d904cf48176c2a9067d..6ac99744d79380803925f973c5b39262685e1ff0 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -13,10 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/memory/detail/buddy_allocator.h" - #include #include - #include "glog/logging.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 780126f57c52f97844aaac0c0b4b09ead9b14577..1722acd10aa38e33f3c11aa8eac7cb50dce9fed4 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -21,12 +21,9 @@ limitations under the License. */ #endif #include "gflags/gflags.h" #include "gtest/gtest.h" -#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/gpu_info.h" #ifdef PADDLE_WITH_CUDA -#include - #include #include diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index 34bb40d549dff1d711cd8924630cdf528f41ed76..ea4897494f72b96e85911e03b651af1b4eac3298 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/system_allocator.h" #include -#include #include "gflags/gflags.h" #include "gtest/gtest.h" diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 9ba572acaca9eba2b913847c52e5a54e19d79bdf..73487795f752eab69e660154c2e35817b2c80368 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" + namespace paddle { namespace platform { diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 33d8fb828f86e1f689ad4f67ad0033b45ce2671e..1903b9e30d80022c7351b155c1e0c149cf562a5b 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include -#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/platform/cudnn_desc.h" +namespace paddle { +namespace platform { +struct CUDAPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { using framework::Tensor; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 5a3660cee85762f3d76129dfb694eeb6d87bb52c..95214484dca99e718f8ec62225ce40ce3ffd7323 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -763,10 +763,28 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; -// +// AbsGrad: dx=dy if x >=0 else -dy +// AbsDoubleGrad: ddy = ddx if x >=0 else -ddx +template +class AbsDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("abs_grad_grad"); + // input1: x + op->SetInput("X", this->Input("X")); + // input2: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetAttrMap(this->Attrs()); + // output: ddy + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 -// template class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { public: @@ -1214,7 +1232,13 @@ REGISTER_OPERATOR( std::conditional>(), ops::ActFwdInplaceInferer, void>::type); REGISTER_OPERATOR(abs_grad, ops::ActivationOpGrad, - ops::ActivationGradOpInplaceInferer); + ops::ActivationGradOpInplaceInferer, + ops::AbsDoubleGradMaker, + ops::AbsDoubleGradMaker); +REGISTER_OPERATOR( + abs_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); REGISTER_OP_CPU_KERNEL(abs, ops::ActivationKernel>, ops::ActivationGradKernel>); +REGISTER_OP_CPU_KERNEL( + abs_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); /* ========================================================================== */ /* ========================== register checkpoint ===========================*/ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 48ec90471f0becf921e7e68eb8722544885aaa7a..072d952d2618d2c9dbbe27641dcd951a634753a0 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -160,7 +160,7 @@ REGISTER_OP_CUDA_KERNEL( ops::ExpGradFunctor>); /* ========================================================================== */ -/* ========================== exp register ============================ */ +/* ========================== abs register ============================ */ REGISTER_OP_CUDA_KERNEL( abs, ops::ActivationKernel>, @@ -180,4 +180,16 @@ REGISTER_OP_CUDA_KERNEL( ops::AbsGradFunctor>, ops::ActivationGradKernel>); +REGISTER_OP_CUDA_KERNEL( + abs_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); /* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 
00a7c063c9155488d117332d5ef3541d16d76bdb..646f546bffb2ced3830c119b5f24f6d3fcad0e78 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -1430,6 +1430,27 @@ class ActivationDoubleGradKernel } }; +template +struct AbsGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* X, + const framework::Tensor* Out, const framework::Tensor* ddX, + framework::Tensor* ddOut, framework::Tensor* dOut, + framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "AbsGradGrad")); + auto x = framework::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "AbsGradGrad")); + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "AbsGradGrad")); + ddout.device(*d) = ddx * x.sign(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct ReluGradGradFunctor : public BaseActivationFunctor { template diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc index 629fedba6e3db474869ebddc02470c2ff007e658..e5fcd270eb8b8fa58175e11e955161ebfbb2846c 100644 --- a/paddle/fluid/operators/add_position_encoding_op.cc +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -69,12 +69,18 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("alpha", "The scale of Original Embedding.") .SetDefault(1.0f) .AddCustomChecker([](const float& alpha) { - PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0."); + PADDLE_ENFORCE_GE( + alpha, 0.0f, + platform::errors::InvalidArgument( + "Attribute 'alpha' must be greater than or equal to 0.0.")); }); AddAttr("beta", "The scale of Position Embedding.") .SetDefault(1.0f) .AddCustomChecker([](const float& beta) { - PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0."); + PADDLE_ENFORCE_GE( + beta, 0.0f, + platform::errors::InvalidArgument( + "Attribute 'beta' must be greater than or equal to 0.0.")); }); AddComment(R"DOC( Add Position Encoding Operator. diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index ea1a93b5826496aafc7a092d91ad4b4a4bad7215..89475e3a2ec072d6b331ef8136c0e5e29f45e25d 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -12,20 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
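// Illustrative sketch, not part of the patch: the AbsDoubleGradMaker and
// AbsGradGradFunctor added above encode the rule noted in activation_op.cc -
// AbsGrad gives dx = dy if x >= 0 else -dy, and the double grad gives
// ddout = ddx if x >= 0 else -ddx (both multiply by the sign of x).
// A tiny standalone version of that rule, with hypothetical helper names:

#include <cstdio>

double abs_grad(double x, double dy) { return x >= 0.0 ? dy : -dy; }
double abs_grad_grad(double x, double ddx) { return x >= 0.0 ? ddx : -ddx; }

int main() {
  // At x = -3 the first-order grad flips the sign of the upstream gradient.
  std::printf("dx  = %g\n", abs_grad(-3.0, 2.0));       // prints -2
  // The second-order grad applies the same sign(x) factor to ddx.
  std::printf("ddy = %g\n", abs_grad_grad(-3.0, 0.5));  // prints -0.5
  return 0;
}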
*/ #include -#include -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class OpDesc; +class Scope; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { using LoD = framework::LoD; struct ArrayToLoDFunctor; + template struct ArrayToLoDFunctorImpl { const ArrayToLoDFunctor *prev_functor_; diff --git a/paddle/fluid/operators/assert_op.cc b/paddle/fluid/operators/assert_op.cc index da0e5fda63633084d766dab763d675d6943a828d..3e4250389fcfce26ddb4a845fd681955650b98ee 100644 --- a/paddle/fluid/operators/assert_op.cc +++ b/paddle/fluid/operators/assert_op.cc @@ -13,10 +13,24 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/tensor_formatter.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class LoDTensor; +class OpDesc; +class Scope; +class Variable; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + const char kCond[] = "Cond"; const char kData[] = "Data"; const char kSummarize[] = "summarize"; diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index f8c1216e9722b085314885c5714a3dd58d6e7176..e5bceae1c9520901a4d18e6e8321c00f922e2d8f 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -14,9 +14,23 @@ limitations under the License. */ #include "paddle/fluid/operators/assign_op.h" -#include #include +namespace paddle { +namespace framework { +class OpDesc; +class Variable; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index c2154f78bbe97418f2c7388a000dc833134d0c84..ed4ba24a74bea2aaedbbef4908904e2696826290 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -19,6 +19,13 @@ limitations under the License. */ #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class LoDTensor; +class Variable; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { class AssignFunctor { diff --git a/paddle/fluid/operators/assign_op_test.cc b/paddle/fluid/operators/assign_op_test.cc index 58f360ad6059e4fe9743439cd0a16b1d7b9e241f..f0ec04a1f209c747f147f4069746e7935a85ccec 100644 --- a/paddle/fluid/operators/assign_op_test.cc +++ b/paddle/fluid/operators/assign_op_test.cc @@ -14,8 +14,6 @@ limitations under the License. 
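// Illustrative sketch, not part of the patch: the operator .cc/.h changes in
// this region swap heavy framework includes for forward declarations of types
// (framework::OpDesc, imperative::OpBase, ...) that are only named, never
// dereferenced, in those translation units. The general trade-off, with
// hypothetical names (widget.h / Widget are assumptions for illustration):

// Before: the full definition is pulled in just to declare a reference parameter.
//   #include "widget.h"
//   void Process(const Widget& w);

// After: a forward declaration suffices for pointer and reference parameters,
// so files including this header no longer rebuild whenever widget.h changes.
class Widget;
void Process(const Widget& w);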
*/ #include "paddle/fluid/operators/assign_op.h" #include -#include -#include #include #include "paddle/fluid/framework/ddim.h" diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc index a35e5a80a9eefaa537a4e16e899c24c370db0ba9..1589f9e8911f324d7a930ebb69cba8b2e8fe1c07 100644 --- a/paddle/fluid/operators/assign_value_op.cc +++ b/paddle/fluid/operators/assign_value_op.cc @@ -13,8 +13,23 @@ // limitations under the License. #include "paddle/fluid/operators/assign_value_op.h" + #include -#include + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h index b462c43d23a534c3520a2a852252fe0333222d77..6c2e5b9ad68987c99675330b9f5591f39244d270 100644 --- a/paddle/fluid/operators/assign_value_op.h +++ b/paddle/fluid/operators/assign_value_op.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -76,7 +77,10 @@ class AssignValueKernel : public framework::OpKernel { value_name = "int64_values"; break; default: - PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type(code %d) for AssignValue operator, only " + "supports bool, int32, float32 and int64.", + dtype)); break; } CopyVecotorToTensor(value_name, out, ctx); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index dcfe8bb1bb48a505f5526f6471e8ce9ba848b5b3..7a88403aa9daa78f1093115124eb19167bf6e99d 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -839,6 +839,7 @@ void BatchNormDoubleGradMaker::Apply(GradOpPtr op) const { op->SetInput("SavedMean", this->Input("SavedMean")); op->SetInput("SavedVariance", this->Input("SavedVariance")); if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) { + op->SetInput("Mean", this->Input("Mean")); op->SetInput("Variance", this->Input("Variance")); } op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); @@ -868,14 +869,19 @@ void BatchNormDoubleGradOp::InferShape( "BatchNormDoubleGrad"); } - OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", "BatchNormDoubleGrad"); OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "BatchNormDoubleGrad"); // check output OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", "BatchNormDoubleGrad"); const auto x_dims = ctx->GetInputDim("X"); - const int C = x_dims[1]; + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = + ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + if (ctx->HasOutput("DX")) { ctx->SetOutputDim("DX", x_dims); } @@ -957,7 +963,9 @@ class BatchNormDoubleGradKernel Tensor inv_var_tensor; if (use_global_stats) { + const auto *running_mean = ctx.Input("Mean"); const auto *running_variance = ctx.Input("Variance"); + mean_data = running_mean->data(); inv_var_tensor.Resize({C}); T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); @@ -1077,12 +1085,12 @@ class BatchNormDoubleGradKernel // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * // np.sum(dy, // axis=(n,h,w)) * (x - mean) * - // (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var - + // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - // inv_var // * // np.mean(dy, axis=(n,h,w)) - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(n,h,w)))) + // axis=(n,h,w))) if (ddX) { dx_arr += @@ -1176,7 +1184,8 @@ class BatchNormDoubleGradKernel C, sample_size); ddy_arr.setZero(); if (use_global_stats) { - // math: ddy = r * ddx * inv_var + // math: ddy = r * ddx * inv_var + ddbias + + // ddscale * (x - mean) * inv_var if (ddX) { ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; } @@ -1196,25 +1205,29 @@ class BatchNormDoubleGradKernel .replicate(1, sample_size) / sample_size); } - if (ddScale && ddBias) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + } - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({C, sample_size}); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddbias_tile_data = ddbias_arr.replicate(1, sample_size); + if (ddBias) { + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + Tensor ddbias_tile; + ddbias_tile.Resize({C, sample_size}); + EigenArrayMap ddbias_tile_data( + ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); + ddbias_tile_data = ddbias_arr.replicate(1, sample_size); - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - ddy_arr += ddbias_tile_data; - } + ddy_arr += ddbias_tile_data; } + if (data_layout == DataLayout::kNCHW) { VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; TransToChannelFirst( diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index d2cf38049300578eb1626d39c0959b9beed13cdd..f24a3c316a05a8bf171812be0a6b3445488aeb58 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -26,25 +26,47 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Input"), - "Input(Input) of %s should not be null.", Type()); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of %s should not be null.", Type()); + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", Type()); + 
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", Type()); auto &shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE_GT(shape.size(), 0); + PADDLE_ENFORCE_GT(shape.size(), 0, + platform::errors::InvalidArgument( + "Shape size must be larger than 0, but received: %s.", + shape.size())); std::vector shape_int64(shape.size(), 0); std::transform(shape.begin(), shape.end(), shape_int64.begin(), [](int a) { return static_cast(a); }); auto output_dim = framework::make_ddim(shape_int64); int input_dim_idx = ctx->Attrs().Get("input_dim_idx"); - PADDLE_ENFORCE_GE(input_dim_idx, 0); - PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx); + int input_dim_size = static_cast(ctx->GetInputDim("Input").size()); + PADDLE_ENFORCE_GE(input_dim_idx, 0, + platform::errors::InvalidArgument( + "Input dimension index must be larger " + "equal than 0, but received: %s.", + input_dim_idx)); + PADDLE_ENFORCE_GT(input_dim_size, input_dim_idx, + platform::errors::InvalidArgument( + "Input dimension size must be larger than " + "input dimension index, but received input " + "dimension size: %s, input dimension index: %s.", + input_dim_size, input_dim_idx)); int output_dim_idx = ctx->Attrs().Get("output_dim_idx"); - PADDLE_ENFORCE_GE(output_dim_idx, 0); - PADDLE_ENFORCE_GT(static_cast(shape.size()), output_dim_idx); + int output_dim_size = static_cast(shape.size()); + PADDLE_ENFORCE_GE(output_dim_idx, 0, + platform::errors::InvalidArgument( + "Output dimension index must be larger " + "equal than 0, but received: %s.", + output_dim_idx)); + PADDLE_ENFORCE_GT( + output_dim_size, output_dim_idx, + platform::errors::InvalidArgument( + "Output dimension size must be larger than output dimension index, " + "but received output dimension size: %s, output dimension index: " + "%s.", + output_dim_size, output_dim_idx)); output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx]; ctx->SetOutputDim("Out", output_dim); diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index c177dad8028222da39ba85a78ba684f8933e07b5..3cb3f1d48bfa7108f739463b40bf23f17c18f651 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -12,12 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
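// Illustrative sketch, not part of the patch: the batch_norm_op.cc double-grad
// change earlier in this patch stops hard-coding the channel count as dims[1]
// and instead picks it by data layout (dims[1] for NCHW, the last dimension for
// NHWC). A minimal standalone version of that selection rule:

#include <cassert>
#include <vector>

// Returns the channel count of a 4-D shape under the given layout.
int ChannelCount(const std::vector<int>& dims, bool is_nchw) {
  return is_nchw ? dims[1] : dims[dims.size() - 1];
}

int main() {
  std::vector<int> nchw = {8, 3, 224, 224};  // N, C, H, W
  std::vector<int> nhwc = {8, 224, 224, 3};  // N, H, W, C
  assert(ChannelCount(nchw, /*is_nchw=*/true) == 3);
  assert(ChannelCount(nhwc, /*is_nchw=*/false) == 3);
  return 0;
}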
*/ -#include #include #include "paddle/fluid/operators/beam_search_decode_op.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 5ec34e574504e1058021a0623d09d4d33cf75c66..654df5ccd5e9df324f6e127addadd4e71a641d94 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -47,8 +47,8 @@ void OpTester::Init(const OpTesterConfig &config) { CreateInputVarDesc(); CreateOutputVarDesc(); } else { - PADDLE_THROW(platform::errors::NotFound("Operator '%s' is not registered.", - config_.op_type)); + PADDLE_THROW(platform::errors::NotFound( + "Operator '%s' is not registered in OpTester.", config_.op_type)); } if (config_.device_id >= 0) { @@ -81,7 +81,8 @@ void OpTester::Run() { platform::EnableProfiler(platform::ProfilerState::kAll); platform::SetDeviceId(config_.device_id); #else - PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); + PADDLE_THROW(platform::errors::PermissionDenied( + "'CUDAPlace' is not supported in CPU only device.")); #endif } @@ -162,7 +163,8 @@ framework::proto::VarType::Type OpTester::TransToVarType(std::string str) { } else if (str == "fp64") { return framework::proto::VarType::FP64; } else { - PADDLE_THROW("Unsupported dtype %s.", str.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported dtype %s in OpTester.", str.c_str())); } } @@ -233,8 +235,8 @@ void OpTester::CreateOpDesc() { case framework::proto::AttrType::INTS: case framework::proto::AttrType::FLOATS: case framework::proto::AttrType::STRINGS: - PADDLE_THROW( - platform::errors::Unimplemented("Not supported STRINGS type yet.")); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported STRINGS type in OpTester yet.")); break; case framework::proto::AttrType::LONG: { int64_t value = StringTo(value_str); @@ -242,7 +244,8 @@ void OpTester::CreateOpDesc() { } break; case framework::proto::AttrType::LONGS: default: - PADDLE_THROW("Unsupport attr type %d", type); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport attr type %d in OpTester.", type)); } } } @@ -299,7 +302,8 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor, } is.close(); } else { - PADDLE_THROW("Unsupported initializer %s.", initializer.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported initializer %s in OpTester.", initializer.c_str())); } if (!platform::is_cpu_place(place_)) { @@ -351,7 +355,8 @@ void OpTester::CreateVariables(framework::Scope *scope) { static_cast(1.0), item.second.initializer, item.second.filename); } else { - PADDLE_THROW("Unsupported dtype %d.", data_type); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported dtype %d in OpTester.", data_type)); } VLOG(3) << "Set lod for tensor " << var_name; @@ -473,7 +478,8 @@ std::string OpTester::DebugString() { << "\n"; } break; default: - PADDLE_THROW("Unsupport attr type %d", attr_type); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport attr type %d in OpTester.", attr_type)); } ss << GenSpaces(--count) << "}\n"; } @@ -484,8 +490,10 @@ std::string OpTester::DebugString() { TEST(op_tester, base) { if (!FLAGS_op_config_list.empty()) { std::ifstream 
fin(FLAGS_op_config_list, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", - FLAGS_op_config_list.c_str()); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::InvalidArgument("OpTester cannot open file %s", + FLAGS_op_config_list.c_str())); std::vector op_configs; while (!fin.eof()) { VLOG(4) << "Reading config " << op_configs.size() << "..."; diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc index 818e5f64edc2c1d213659c48d282df75625676ca..e9477798858d13e7a2862081561634011f9156c8 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.cc +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -78,7 +78,8 @@ void OpInputConfig::ParseDType(std::istream& is) { } else if (dtype_str == "fp64" || dtype_str == "double") { dtype = "fp64"; } else { - PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported dtype %s in OpInputConfig.", dtype_str.c_str())); } VLOG(4) << "dtype of input " << name << " is: " << dtype; } @@ -91,7 +92,9 @@ void OpInputConfig::ParseInitializer(std::istream& is) { const std::vector supported_initializers = {"random", "natural", "zeros", "file"}; if (!Has(supported_initializers, initializer_str)) { - PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported initializer %s in OpInputConfig.", + initializer_str.c_str())); } initializer = initializer_str; @@ -126,7 +129,12 @@ void OpInputConfig::ParseLoD(std::istream& is) { } } EraseEndSep(&lod_str); - PADDLE_ENFORCE_GE(lod_str.length(), 4U); + PADDLE_ENFORCE_GE( + lod_str.length(), 4U, + platform::errors::InvalidArgument( + "The length of lod string should be " + "equal to or larger than 4. 
But length of lod string is %zu.", + lod_str.length())); VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length(); // Parse the lod_str @@ -153,8 +161,10 @@ void OpInputConfig::ParseLoD(std::istream& is) { OpTesterConfig::OpTesterConfig(const std::string& filename) { std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", - filename.c_str()); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::InvalidArgument("OpTesterConfig cannot open file %s.", + filename.c_str())); Init(fin); } diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 5b7bcde21a99f23b653cc8b822aa3e22539e9d82..d67d90c348e6f1db9fff604b3eff7b6a79141d07 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -33,29 +33,37 @@ class CoalesceTensorOpKernel : public framework::OpKernel { auto out_vars = context.MultiOutputVar("Output"); PADDLE_ENFORCE_GT(in_var_names.size(), static_cast(0), - "The CoalesceTensorOp has no input."); - PADDLE_ENFORCE_EQ( - in_var_names.size(), out_var_names.size(), - "The number of CoalesceTensorOp's input and output is not match."); + platform::errors::InvalidArgument( + "The CoalesceTensor operator has no input.")); + PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size(), + platform::errors::InvalidArgument( + "The number of CoalesceTensor operator's input and " + "output is not match, " + "input number is %u, output number is %u.", + in_var_names.size(), out_var_names.size())); // Input & Output check: only support LoDTensor for (size_t i = 0; i < in_var_names.size(); ++i) { PADDLE_ENFORCE_NOT_NULL( in_vars[i], - "The input variable %s of CoalesceTensorOp does not exist.", - in_var_names[i]); + platform::errors::NotFound("The input variable %s of CoalesceTensor " + "operator does not exist.", + in_var_names[i])); PADDLE_ENFORCE_NOT_NULL( out_vars[i], - "The output variable %s of CoalesceTensorOp does not exist.", - out_var_names[i]); - PADDLE_ENFORCE_EQ( - in_vars[i]->IsType(), true, - "The input variable %s of CoalesceTensorOp is not LoDTensor.", - in_var_names[i]); - PADDLE_ENFORCE_EQ( - out_vars[i]->IsType(), true, - "The output variable %s of CoalesceTensorOp is not LoDTensor.", - in_var_names[i]); + platform::errors::NotFound("The output variable %s of CoalesceTensor " + "operator does not exist.", + out_var_names[i])); + PADDLE_ENFORCE_EQ(in_vars[i]->IsType(), true, + platform::errors::InvalidArgument( + "The input variable %s of CoalesceTensor operator " + "is not LoDTensor.", + in_var_names[i])); + PADDLE_ENFORCE_EQ(out_vars[i]->IsType(), true, + platform::errors::InvalidArgument( + "The output variable %s of CoalesceTensor operator " + "is not LoDTensor.", + in_var_names[i])); } auto in_tensors = context.MultiInput("Input"); @@ -64,7 +72,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel { for (size_t i = 0; i < in_var_names.size(); ++i) { PADDLE_ENFORCE_EQ( in_var_names[i], out_var_names[i], - "The input and output variable of CoalesceTensorOp is different."); + platform::errors::InvalidArgument( + "The input and output variable of CoalesceTensor operator is " + "different, %dth input is %s, %dth output is %s.", + i, in_var_names[i], i, out_var_names[i])); } } else { // Init the output as input @@ -134,16 +145,25 @@ class CoalesceTensorOpKernel : public framework::OpKernel { const std::vector &lod_tensors, const std::vector var_names, size_t *numel, const size_t 
&size_of_dtype, const platform::Place &place) const { - PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size()); + PADDLE_ENFORCE_EQ( + lod_tensors.size(), var_names.size(), + platform::errors::InvalidArgument( + "The number of input tensor and variable does not match, the " + "number of input tensor is %u, the number of input variable is %u.", + lod_tensors.size(), var_names.size())); *numel = 0; std::stringstream ss; ss << "alloc_space_for_vars: "; for (size_t i = 0; i < var_names.size(); ++i) { PADDLE_ENFORCE_EQ(lod_tensors[i]->IsInitialized(), true, - "%s is not initialized.", var_names[i]); + platform::errors::InvalidArgument( + "Tensor `%s` is not initialized.", var_names[i])); auto size = lod_tensors[i]->numel(); - PADDLE_ENFORCE_GT(size, 0); + PADDLE_ENFORCE_GT( + size, 0, + platform::errors::InvalidArgument( + "The number of tensor `%s`'s elements is 0.", var_names[i])); ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() << ") " << " addres:" << lod_tensors[i]->data() << ", "; diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc index bcb529f1570828d2bcca6d4275a213be4d5633eb..835b49e57bc0922a0d0be7895b57275ba31d2173 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -14,6 +14,21 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index 34054103aa0cf195ed394df860a8219ff3aa0157..ec8d651819502021c488d20c4a297e95ec664387 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -14,6 +14,13 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace platform { +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc index 9d27a9ceb30704463270b3922c3584febcc05c9e..efc19659c83ec35c9650d3184654f97d23940745 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -14,6 +14,21 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc index 4e8b6f9d0a937ec4a5e7205bbd9ae834a2d305db..7935a1f722e553ff66bfaeb2f1cf2bb7876f95fb 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc @@ -14,6 +14,13 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace platform { +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc index 3cfb1723f18d326b33ed6c332f595010c5241f9a..5ab07ef026bac5bef7386b0789803933cd8fdf2a 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -14,6 +14,21 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc index 61f76c178d03afad6eddf6848badb5894b51dcd8..1a78427cd19ee20440f1f47515c663191431b587 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc @@ -14,6 +14,13 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace platform { +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc index c9ed8b67647eab2b3de8bd132bb70cd2bf16edc0..68061e6ae6bea097b7a2bc5ee19d58c05fd21848 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -14,6 +14,19 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index 8fe7fce21e465af8af4d045c29dbc12ab9bc3c84..06e90cdff80455c688d990f30bd6f88910be3cce 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -14,6 +14,13 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace platform { +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index a3bf1f4dfb1cb09fc864f891dda793ecde9027c6..ccad96320a757295b8c5256260852660fc3dcde8 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -14,19 +14,17 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) #include #endif -#include -#include #include -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index e2b09be5a9dfff0111ab80d89bdd76b99517738f..2822129b19886e230b7bcbc09429168b49240c92 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -11,25 +11,23 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#if defined(PADDLE_WITH_NCCL) -#include -#endif - -#include -#include #include +#include "glog/logging.h" #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" - -#if defined(PADDLE_WITH_NCCL) -#include "paddle/fluid/platform/nccl_helper.h" -#endif +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc index 425351877689f7e3ad8e0a46d2226f5f751a4016..41a07f94399514b5ba8b207e5b5c21bd9793ce92 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc @@ -14,6 +14,21 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc index 7e260346b4bdd8aced0df59c72f5adb4c479e8d0..e03da37360f4749e9375e59fa79a65adb3a077a0 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc @@ -14,6 +14,13 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace platform { +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc index 8e849641e639eeceb48fc95656b269988c827006..77bb96347f94335ae481df16108d7b294590d4db 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cc @@ -14,6 +14,21 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc index 77a75ed0b7af2a7946c02bfa0f33038aa0090c5b..83f7fce1ec6b7d73d3b0451b8a2d83d87be9408e 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc @@ -14,6 +14,13 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace platform { +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc index 64935df856ec79f427bdcd21e03b7c493c31ac1e..f6c1c5d50e864fd02d65f7993257857a1d36e2ab 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cc @@ -14,6 +14,21 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc index 07e431f7bc838caa9bc3abdcd0be1beb94b96635..83db107b36faf721f7a8e3a1ef40b960c93f97eb 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc @@ -14,6 +14,13 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace platform { +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc index 3e20cee7e186a462aedc1881c6e34cacc8d09de0..e59ec85fefd13a5454127dab6a27fd5ed2c6b253 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cc @@ -14,6 +14,21 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc index d9826422c16cb67f9f7101643918a83898c606b3..39c8716a92a36efdec75d4b159709fdaabe9a00b 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc @@ -14,6 +14,13 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace platform { +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 64b60165722a5e5b978c4ead29d2c0fdb344354f..00f366e62120d5cb1adfdf384c3a0759ffbfa12d 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -11,19 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#if defined(PADDLE_WITH_NCCL) -#include -#endif - #include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) -#include "paddle/fluid/platform/collective_helper.h" -#endif +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 5405ea70ef65505fe02bb6f2a3d3b7ee79ce6bfa..7e5311a210323aaa1584345ece3a2059ed956936 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -11,18 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#if defined(PADDLE_WITH_NCCL) -#include -#endif - #include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" #endif namespace paddle { @@ -55,8 +54,10 @@ class CSyncCommStreamOp : public framework::OperatorBase { class CSyncCommStreamOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { - AddInput("X", "(Tensor) Dependency of the variable need to sync"); - AddOutput("Out", "(Tensor) Dependency of the variable need to sync"); + AddInput("X", "(Tensor) Dependency of the variable need to sync") + .AsDuplicable(); + AddOutput("Out", "(Tensor) Dependency of the variable need to sync") + .AsDuplicable(); AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); AddComment(R"DOC( CSyncCommStream Operator diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc index 22b212fc1b9f8844f0ae3555ac6d63af1f48d1cd..ce622d7501f90d0b6d1e9f4f50627fe0038f013e 100644 --- a/paddle/fluid/operators/common_infer_shape_functions.cc +++ b/paddle/fluid/operators/common_infer_shape_functions.cc @@ -13,10 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/common_infer_shape_functions.h" - #include #include +namespace paddle { +namespace framework { +class InferShapeContext; +} // namespace framework +} // namespace paddle + // This file almostly contains all the infershape functions that are used in // operators. diff --git a/paddle/fluid/operators/common_infer_shape_functions.h b/paddle/fluid/operators/common_infer_shape_functions.h index 2cb9eab2865ce068a4f776bc63070c59bf029481..922d5262abc42dfb19f72ddaa47c85e448b84208 100644 --- a/paddle/fluid/operators/common_infer_shape_functions.h +++ b/paddle/fluid/operators/common_infer_shape_functions.h @@ -17,6 +17,12 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +namespace paddle { +namespace framework { +class InferShapeContext; +} // namespace framework +} // namespace paddle + // This file almostly contains all the infershape functions that are used in // operators. diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 4f337c03599a548ac3d95ddd06c726be30d7c13f..7937e432d22faa3ffd93e46a39b7b1cc5500dbf8 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/concat_op.h" + #include #include #include @@ -78,7 +79,8 @@ class ConcatOp : public framework::OperatorWithKernel { } } if (flag == 0) { - PADDLE_THROW("All Inputs of Concat OP are Empty!"); + PADDLE_THROW(platform::errors::InvalidArgument( + "All Inputs of Concat OP are Empty!")); } #ifdef PADDLE_WITH_MKLDNN if (platform::CanMKLDNNBeUsed(ctx)) { diff --git a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc index aa31b8875620d967f115a4d18b5022ca3eb21804..62019be26cdef8214fe0e7c3e063c9387a30c91a 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc @@ -14,6 +14,18 @@ limitations under the License. */ #include "paddle/fluid/operators/controlflow/conditional_block_op.h" +namespace paddle { +namespace framework { +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index 659e098c6dd20840e0efcea5ad0c5ee37300f4f4..c8ab2c91e912241457a8ab2f5455ef886b3b18ae 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type.h" diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc index 155a5dbfec31f3a689fe199827f8ccf1c783d4e6..00b86121c0ddab5d53bf4c417ce27885ae0deb88 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc @@ -13,12 +13,19 @@ // limitations under the License. 
#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" + #include #include #include -#include + #include "paddle/fluid/operators/controlflow/op_variant.h" +namespace paddle { +namespace framework { +class ProgramDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h index f7dfba6f364e197a97cc5e061e42cd5cc84309db..abaaa8976065ca5636369036a0701d3d5bdfab00 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h @@ -16,9 +16,16 @@ #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/conditional_block_op.h" +namespace paddle { +namespace framework { +class ProgramDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_test.cc b/paddle/fluid/operators/controlflow/conditional_block_op_test.cc index a5ca4a289deb1eb9a4666e84775bb26c0bdf030c..068d853f1cc7259c54defa8ee9519aa4e5e1e98e 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op_test.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op_test.cc @@ -13,14 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/controlflow/conditional_block_op.h" -#include -#include -#include + #include "gtest/gtest.h" -#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_type.h" USE_NO_KERNEL_OP(conditional_block); USE_NO_KERNEL_OP(conditional_block_grad); diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 088413ce223594ac8a32f0d06662dd3b8342c096..9597dd25ec530f6f2cb32ee12a08b05feb5b8bfa 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -12,10 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +namespace paddle { +namespace framework { +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { class FeedOp : public framework::OperatorBase { diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 4ab9f9fc863a6979496707ba4cc6794a56c37b30..e8829e1e1fa4628ac1c1989e91d6ee8048eb2a3d 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include // NOLINT #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/gpu_info.h" #endif diff --git a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h index 9af993f1006c1e0107951c89460abe76c1561064..28c27437de12e37eaa94cef677fe3c9fc292c7c7 100644 --- a/paddle/fluid/operators/controlflow/op_variant.h +++ b/paddle/fluid/operators/controlflow/op_variant.h @@ -20,6 +20,13 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/variant.h" +namespace paddle { +namespace framework { +class OpDesc; +class OperatorBase; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc index f2973add84e89f931aaec09515c17cf5725d8759..c96b7c6a08c79c188d8254282d1e29b4ec105c03 100644 --- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc +++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc @@ -13,14 +13,15 @@ // limitations under the License. #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" - #include #include -#include -#include -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/recurrent_op.h" +namespace paddle { +namespace framework { +class BlockDesc; +class ProgramDesc; +} // namespace framework +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.h b/paddle/fluid/operators/controlflow/recurrent_op_helper.h index aacca0762ca1d45634d36da572448dae7e9fe195..3d9404e57aa2715fe893879ac82735dc09006875 100644 --- a/paddle/fluid/operators/controlflow/recurrent_op_helper.h +++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.h @@ -26,6 +26,12 @@ #include "paddle/fluid/platform/variant.h" #include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace framework { +class ProgramDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc index 9f7702a5d6b63cc689535f2f1c880058e6211709..c4451c3b583c72fc6b7b1d4a257bd484fe9e3e53 100644 --- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc @@ -12,7 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/array_operator.h" -#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace framework { +class OpDesc; +class Scope; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index e4a1397f5c426a31b19c04aa8fb97c516a759c36..b85e740ada9bde7c7a9f4190082cf91def37f8cf 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -12,14 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class VarDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h index 4f9d93c91f6f3f167fcda66d25addd0b6ac3ec6d..d2e9953e6477ad10c297adca13f4d8604a1f6f5d 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -17,9 +17,17 @@ #include #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/variant.h" +namespace paddle { +namespace framework { +class LoDTensor; +class ProgramDesc; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 25b45f281a799ade12ec9cbfb8fb262dbc572196..fac8e24251033c301c911f35dcfd0ddb82b713ce 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -162,7 +162,20 @@ struct SearchAlgorithm { workspace_size = GetWorkspaceSize(args, algo); if (workspace_size > workspace_size_limit) { +#if CUDNN_VERSION >= 8000 workspace_size_limit = workspace_size; +#else + VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " + "the workspace size request(" + << workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionForwardAlgorithm( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); +#endif } #else PADDLE_ENFORCE_CUDA_SUCCESS( @@ -291,8 +304,23 @@ struct SearchAlgorithm { #endif workspace_size = GetWorkspaceSize(args, algo); if (workspace_size > workspace_size_limit) { - workspace_size_limit = workspace_size; has_got_workspace_size = false; +#if CUDNN_VERSION >= 8000 + // There is no cudnnGetConvolutionBackwardDataAlgorithm in CUDNN 8 + // version. 
+ workspace_size_limit = workspace_size; +#else + VLOG(1) << "Fallback to non-v7 method to find conv algorithm because " + "the workspace size request(" + << workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); +#endif } #else PADDLE_ENFORCE_CUDA_SUCCESS( diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 00af724ac7fce64b9a210bf43a150acf20f34dce..f8b76f387cc1954cc4329daf04a6bdcab5c0775f 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -50,8 +50,9 @@ class CUDNNConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); const Tensor* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); @@ -60,14 +61,16 @@ class CUDNNConvOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + bool deterministic = FLAGS_cudnn_deterministic; + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, + platform::errors::InvalidArgument( + "Can't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at the same time.")); - if (exhaustive_search && FLAGS_cudnn_deterministic) { - PADDLE_THROW( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time."); - } const std::string padding_algorithm = ctx.Attr("padding_algorithm"); const std::string data_format = ctx.Attr("data_format"); @@ -197,7 +200,8 @@ class CUDNNConvOpKernel : public framework::OpKernel { &transformed_input); } break; default: - PADDLE_THROW("ConvOp only support tensors with 4 or 5 dimensions."); + PADDLE_THROW(platform::errors::InvalidArgument( + "ConvOp only supports tensors with 4 or 5 dimensions.")); } } else { @@ -317,8 +321,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); auto input = ctx.Input("Input"); auto filter = ctx.Input("Filter"); auto output_grad = ctx.Input(framework::GradVarName("Output")); @@ -337,14 +342,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); std::string padding_algorithm = ctx.Attr("padding_algorithm"); int groups = ctx.Attr("groups"); + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); bool deterministic = FLAGS_cudnn_deterministic; - if (exhaustive_search &&
deterministic) { - PADDLE_THROW( - "Can't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time."); - } + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, + platform::errors::InvalidArgument( + "Can't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at the same time.")); + const std::string data_format = ctx.Attr("data_format"); const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); @@ -495,7 +502,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { &transformed_input); } break; default: - PADDLE_THROW("ConvOp only support tensors with 4 or 5 dimensions."); + PADDLE_THROW(platform::errors::InvalidArgument( + "ConvOp only supports tensors with 4 or 5 dimensions.")); } } else { transformed_input.ShareDataWith(transformed_input_channel); @@ -701,8 +709,9 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); auto X = ctx.Input("Input"); auto W = ctx.Input("Filter"); auto dO = ctx.Input("DOutput"); @@ -736,14 +745,16 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { const std::vector& strides = ctx.Attr>("strides"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); bool deterministic = FLAGS_cudnn_deterministic; - if (exhaustive_search && deterministic) { - PADDLE_THROW( - "Can't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time."); - } + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, + platform::errors::InvalidArgument( + "Can't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at the same time.")); + std::vector paddings = ctx.Attr>("paddings"); std::string padding_algorithm = ctx.Attr("padding_algorithm"); @@ -878,7 +889,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { } } break; default: - PADDLE_THROW("ConvOp only support tensors with 4 or 5 dimensions."); + PADDLE_THROW(platform::errors::InvalidArgument( + "ConvOp only supports tensors with 4 or 5 dimensions.")); } } else { diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 8a5345e3cf8d9f1c657fe2996015af4dc038a1bf..662fac9e77e023d2e1b173caa5a9769b56eaf0c4 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -685,8 +685,9 @@ class GemmConvDoubleGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - "It must use CPUPlace."); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); const Tensor* X = ctx.Input("Input"); const Tensor* dY = ctx.Input("DOutput"); const Tensor* ddX = ctx.Input("DDInput"); @@ -982,11 +983,20 @@ class DepthwiseConvKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(
output->dims()[output->dims().size() - 1] % input->dims()[input->dims().size() - 1], - 0, "The output channels must be a multiple of the input channels"); + 0, platform::errors::InvalidArgument( + "ShapeError: The output channels must be a multiple of the " + "input channels. But received output channel number is %d " + "and input channel number is %d", + output->dims()[output->dims().size() - 1], + input->dims()[input->dims().size() - 1])); } else { PADDLE_ENFORCE_EQ( output->dims()[1] % input->dims()[1], 0, - "The output channels must be a multiple of the input channels"); + platform::errors::InvalidArgument( + "ShapeError: The output channels must be a multiple of the " + "input channels. But received output channel number is %d " + "and input channel number is %d", + output->dims()[1], input->dims()[1])); } // transform tensor Tensor transformed_input(input->type()); diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 99ec1e048101b281e71005f6fde328c664ba66be..5249264b1c9bcf13c5ee8227828087659de5254b 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -51,8 +51,9 @@ template class CUDNNConvTransposeOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); @@ -145,9 +146,8 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { ctx, input_pad, input_transpose, pad_value, &transformed_input); } break; default: - PADDLE_ENFORCE_EQ( - rank == 4 || rank == 5, true, - "Op(ConvTranspose) only supports 4-D or 5-D input Tensor."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Op(ConvTranspose) only supports 4-D or 5-D input Tensor.")); } } else { transformed_input = input_transpose; @@ -290,8 +290,9 @@ template class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); auto input = ctx.Input("Input"); auto filter = ctx.Input("Filter"); auto output_grad = ctx.Input(framework::GradVarName("Output")); @@ -393,9 +394,8 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { &transformed_output_grad); } break; default: - PADDLE_ENFORCE_EQ( - rank == 4 || rank == 5, true, - "Op(ConvTranspose) only supports 4-D or 5-D input Tensor."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Op(ConvTranspose) only supports 4-D or 5-D input Tensor.")); } } else { transformed_output_grad = output_grad_transpose; diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 59b3677acc41658936dc678d9810c923a80bf6e1..1ea869e002af3ac8157321c66616b82517e4fabc 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -580,7 +580,12 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel { output->mutable_data(context.GetPlace());
int groups = context.Attr("groups"); - PADDLE_ENFORCE_EQ(groups, filter.dims()[0]); + PADDLE_ENFORCE_EQ( + groups, filter.dims()[0], + platform::errors::InvalidArgument( + "groups should be equal to the 1st dimension of filter. But " + "received groups is %d and filter dimension[0] is %d", + groups, filter.dims()[0])); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); @@ -588,7 +593,10 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel { std::string padding_algorithm = context.Attr("padding_algorithm"); for (auto v : dilations) { - PADDLE_ENFORCE_EQ(v, 1); + PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( + "dilations should be 1 in depthwise conv. " + "But received dilations is %d", + v)); } auto in_dims = input->dims(); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 6457d9295dcbfa99d18f63fbda3dae048d7713cd..6ac75b78d7058eda4bf3564c184a8b0588a40578 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -16,8 +16,13 @@ limitations under the License. */ #include "paddle/fluid/operators/cudnn_lstm_cache.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace platform { +class CUDADeviceContext; +struct CUDAPlace; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc index ec60569be200a4ed09aff046233f9977f47b5195..aa5649e4e9ce29f831f7f88a36c6e0ec70162929 100644 --- a/paddle/fluid/operators/delete_var_op.cc +++ b/paddle/fluid/operators/delete_var_op.cc @@ -12,6 +12,19 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { class DeleteVarOp : public framework::OperatorBase { diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc index 48743f2e48c8a7686497adff52f23f31346aeda7..c8bca25b6b0f0e34fbfad5c2192faac24bd22ffa 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cc +++ b/paddle/fluid/operators/dequantize_abs_max_op.cc @@ -13,8 +13,23 @@ See the License for the specific language governing permissions and limitations under the License.
*/ #include "paddle/fluid/operators/dequantize_abs_max_op.h" + #include -#include + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { @@ -45,10 +60,8 @@ class DequantizeMaxAbsOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - "Input(X) of DequantizeMaxAbsOp should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - "Output(Out) of DequantizeMaxAbsOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DequantizeMaxAbs"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DequantizeMaxAbs"); ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.h b/paddle/fluid/operators/dequantize_abs_max_op.h index 796ca93b0008180d602afe8e9c73efa7a749bc66..0d9d20fc120ca43e3127aa6a18abb475bd2af5b4 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.h +++ b/paddle/fluid/operators/dequantize_abs_max_op.h @@ -15,10 +15,17 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { @@ -35,6 +42,7 @@ class DequantizeMaxAbsKernel : public framework::OpKernel { virtual void Compute(const framework::ExecutionContext& ctx) const { auto* in = ctx.Input("X"); auto* scale = ctx.Input("Scale"); + auto* out = ctx.Output("Out"); float max_range = ctx.Attr("max_range"); diff --git a/paddle/fluid/operators/dequantize_log_op.cc b/paddle/fluid/operators/dequantize_log_op.cc index 2ecd54f7edde63ab5f5256694117cfb15be69384..c12dd9e6d218a251850eb828668a8613fe0ab3b5 100644 --- a/paddle/fluid/operators/dequantize_log_op.cc +++ b/paddle/fluid/operators/dequantize_log_op.cc @@ -13,9 +13,23 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/dequantize_log_op.h" -#include + #include -#include + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dequantize_log_op.h b/paddle/fluid/operators/dequantize_log_op.h index f6590ecf61fe6e4c0ad1b667bb5c712b36a1bc45..67ce9cc84d3a8548e87e6554129f82baae2e2009 100644 --- a/paddle/fluid/operators/dequantize_log_op.h +++ b/paddle/fluid/operators/dequantize_log_op.h @@ -15,9 +15,16 @@ limitations under the License. 
*/ #pragma once #include + #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc index b46d231d0ff7774c64745b3b77953cf2ed8d82f7..6b1b0cd8b3578a344978afae642b66759589ffde 100644 --- a/paddle/fluid/operators/detection/gpc.cc +++ b/paddle/fluid/operators/detection/gpc.cc @@ -532,7 +532,8 @@ static int count_contours(polygon_node *polygon) { } static void add_left(polygon_node *p, double x, double y) { - PADDLE_ENFORCE_NOT_NULL(p); + PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument( + "Input polygon node is nullptr.")); vertex_node *nv = NULL; /* Create a new vertex node and set its fields */ @@ -588,7 +589,8 @@ static void add_right(polygon_node *p, double x, double y) { } static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) { - PADDLE_ENFORCE_NOT_NULL(p); + PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument( + "Input polygon node is nullptr.")); polygon_node *target = NULL; /* Label contour as external */ @@ -664,7 +666,8 @@ void add_vertex(vertex_node **t, double x, double y) { } void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) { - PADDLE_ENFORCE_NOT_NULL(e); + PADDLE_ENFORCE_NOT_NULL(e, paddle::platform::errors::InvalidArgument( + "Input edge node is nullptr.")); add_vertex(&(e->outp[p]->v[s]), x, y); e->outp[p]->active++; } @@ -693,7 +696,8 @@ static bbox *create_contour_bboxes(gpc_polygon *p) { gpc_malloc(box, p->num_contours * sizeof(bbox), const_cast("Bounding box creation")); - PADDLE_ENFORCE_NOT_NULL(box); + PADDLE_ENFORCE_NOT_NULL(box, paddle::platform::errors::ResourceExhausted( + "Failed to malloc box memory.")); /* Construct contour bounding boxes */ for (c = 0; c < p->num_contours; c++) { @@ -857,7 +861,9 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) { /* Create an extended hole array */ gpc_malloc(extended_hole, (p->num_contours + 1) * sizeof(int), const_cast("contour hole addition")); - PADDLE_ENFORCE_NOT_NULL(extended_hole); + PADDLE_ENFORCE_NOT_NULL(extended_hole, + paddle::platform::errors::ResourceExhausted( + "Failed to malloc extended hole memory.")); /* Create an extended contour array */ gpc_malloc(extended_contour, @@ -975,7 +981,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, /* Build scanbeam table from scanbeam tree */ gpc_malloc(sbt, sbt_entries * sizeof(double), const_cast("sbt creation")); - PADDLE_ENFORCE_NOT_NULL(sbt); + PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted( + "Failed to malloc scanbeam table memory.")); + build_sbt(&scanbeam, sbt, sbtree); scanbeam = 0; free_sbtree(&sbtree); @@ -1017,7 +1025,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, e0 = aet; e1 = aet; /* Set up bundle fields of first edge */ - PADDLE_ENFORCE_NOT_NULL(aet); + PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument( + "Edge node AET is nullptr.")); + aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); aet->bundle[ABOVE][!aet->type] = 0; aet->bstate[ABOVE] = UNBUNDLED; @@ -1612,7 +1622,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, /* Build scanbeam table from scanbeam tree */ gpc_malloc(sbt, sbt_entries * sizeof(double), const_cast("sbt creation")); - 
PADDLE_ENFORCE_NOT_NULL(sbt); + PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted( + "Failed to malloc scanbeam table memory.")); build_sbt(&scanbeam, sbt, sbtree); scanbeam = 0; free_sbtree(&sbtree); @@ -1650,7 +1661,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, e1 = aet; /* Set up bundle fields of first edge */ - PADDLE_ENFORCE_NOT_NULL(aet); + PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument( + "Edge node AET is nullptr.")); aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); aet->bundle[ABOVE][!aet->type] = 0; aet->bstate[ABOVE] = UNBUNDLED; diff --git a/paddle/fluid/operators/diag_v2_op.cu b/paddle/fluid/operators/diag_v2_op.cu index 4386cc6b8183c03b4d4a19aba7d1126eac2ab495..12ea31945f8d032e1f395c2fb92d9ef31d10c7e8 100644 --- a/paddle/fluid/operators/diag_v2_op.cu +++ b/paddle/fluid/operators/diag_v2_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diag_v2_op.h" @@ -58,6 +59,17 @@ class DiagV2CUDAKernel : public framework::OpKernel { auto out_dims = out->dims(); auto& dev_ctx = context.template device_context(); + auto GetBlockGridSize = [&dev_ctx](int64_t size) { + const int64_t block_size = + std::min(size, static_cast(dev_ctx.GetMaxThreadsPerBlock())); + int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1), + static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (size + block_size - 1) / block_size); + return std::tuple{block_size, grid_size}; + }; + if (x_dims.size() == 1) { float padding_value = context.Attr("padding_value"); math::SetConstant set_padding_value; @@ -67,26 +79,23 @@ class DiagV2CUDAKernel : public framework::OpKernel { auto size = (offset > 0) ? x_length + offset : x_length - offset; const int& x_stride = ComputeStride(0, x_dims); if (size > 0) { - const int block_num = std::min(static_cast(size), - dev_ctx.GetMaxPhysicalThreadCount()); - int size_ = static_cast(size); - int block_num_ = static_cast(block_num); - const int grid_num = - std::min(1024, (size_ + block_num_ - 1) / block_num_); const auto& out_stride_0 = ComputeStride(0, out_dims); const auto& out_stride_1 = ComputeStride(1, out_dims); auto start = (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); - PasteDiagonalKernel<<>>( - out_data, x_data, start, x_length, out_stride_0 + out_stride_1, - x_stride); + std::tuple block_grid_size = GetBlockGridSize(size); + + PasteDiagonalKernel< + T><<(block_grid_size), std::get<0>(block_grid_size), 0, + dev_ctx.stream()>>>(out_data, x_data, start, x_length, + out_stride_0 + out_stride_1, x_stride); } } else { const int& x_stride_0 = ComputeStride(0, x_dims); const int& x_stride_1 = ComputeStride(1, x_dims); - int size; + int64_t size; if (offset > 0) { size = std::min(x_dims[0], x_dims[1] - offset); } else { @@ -94,18 +103,15 @@ class DiagV2CUDAKernel : public framework::OpKernel { } if (size > 0) { - const int block_num = std::min(static_cast(size), - dev_ctx.GetMaxPhysicalThreadCount()); - int size_ = static_cast(size); - int block_num_ = static_cast(block_num); - const int grid_num = - std::min(1024, (size_ + block_num_ - 1) / block_num_); auto start = (offset >= 0 ? 
offset * x_stride_1 : -offset * x_stride_0); const auto& out_stride_0 = ComputeStride(0, out_dims); - ExtractDiagonalKernel<<>>( - out_data, x_data, start, size, x_stride_0 + x_stride_1, - out_stride_0); + std::tuple block_grid_size = GetBlockGridSize(size); + + ExtractDiagonalKernel< + T><<(block_grid_size), std::get<0>(block_grid_size), 0, + dev_ctx.stream()>>>(out_data, x_data, start, size, + x_stride_0 + x_stride_1, out_stride_0); } } } diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h index eadd842c7f6ead56006fd0c34814b1b7bd9b62f4..cef6590ae21e84d16143e149820cd5ad033808bf 100644 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include // NOLINT #include @@ -23,8 +24,6 @@ #include #include -#include - #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc index 17d8973303ba6b2ec8d39179e3052d184d44464c..2d78559625c91fadec1fbb282b08e542a07d964a 100644 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc @@ -13,9 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" - #include - #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc index 6d3f53431113621fc859eda8e7448383772d20a3..fcd3e6abead510393736a9253af2ae1068357a68 100644 --- a/paddle/fluid/operators/distributed/collective_client.cc +++ b/paddle/fluid/operators/distributed/collective_client.cc @@ -12,11 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include // NOLINT -#include -#include "gflags/gflags.h" - #include "paddle/fluid/operators/distributed/collective_client.h" +#include +#include "gflags/gflags.h" DECLARE_int32(rpc_deadline); diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h index 6a3a450a1fd2e52c341f824f4816ca13784bda85..e7d8bb8df9834728682ea131f3ef0d60786908e5 100644 --- a/paddle/fluid/operators/distributed/collective_client.h +++ b/paddle/fluid/operators/distributed/collective_client.h @@ -15,16 +15,27 @@ #pragma once #include // NOLINT +#include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler.h" +namespace paddle { +namespace framework { +class Scope; +class SelectedRows; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + DECLARE_int32(rpc_deadline); namespace paddle { diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc index c95652400c27acd406ca3f70a0dfa8d329e94358..cdd37742d2d5a5a882320cbff1e67a353b4af5f8 100644 --- a/paddle/fluid/operators/distributed/collective_server.cc +++ b/paddle/fluid/operators/distributed/collective_server.cc @@ -12,14 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include // for removing the port file -#include -#include -#include -#include // NOLINT -#include - #include "paddle/fluid/operators/distributed/collective_server.h" +#include DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get"); diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h index 03c688a78e1cbaba4afe4585e619956188a767a1..1015007ba0c566499e3459758cf3a911f661cee0 100644 --- a/paddle/fluid/operators/distributed/collective_server.h +++ b/paddle/fluid/operators/distributed/collective_server.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include #include // NOLINT @@ -22,12 +23,17 @@ limitations under the License. */ #include #include "gflags/gflags.h" - #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/rpc_server.h" +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { namespace distributed { diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc index be8c7a7dd40697d4abb8e53215ce09ae6619f18e..92b2eb4b51e59fec0991712ec4f6d6829b76cfb4 100644 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -13,22 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include #include #include -#include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor_util.h" - #include "paddle/fluid/operators/distributed/collective_client.h" #include "paddle/fluid/operators/distributed/collective_server.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index b2cc9390fa2267404ac246c6b36800833d0dd679..a0ac82a6f4a432ee0f0427a90508c88a262799e3 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -74,8 +74,12 @@ void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, } else { recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); } + + InitParams(); } +void AsyncCommunicator::InitParams() { RecvNoBarrier(); } + AsyncCommunicator::~AsyncCommunicator() { running_ = false; if (main_thread_) main_thread_->join(); @@ -157,16 +161,18 @@ void AsyncCommunicator::MainThread() { } while (running_) { - int meet = Meet(); - - VLOG(1) << "async_meet: " << meet; - - SendGlobalStep(meet); - SendByCommunicator(meet); - BarrierSend(); - RecvByCommunicator(); - BarrierRecv(); - BarrierWeakUp(); + int batches = BatchesCounter(); + + if (batches > 0) { + SendGlobalStep(batches); + SendByCommunicator(batches); + BarrierSend(); + RecvByCommunicator(); + BarrierRecv(); + BarrierWeakUp(); + } else { + VLOG(1) << "get nothing from sending queue, will skip send/recv"; + } } VLOG(1) << "communicator stopped, send thread exit"; } @@ -187,7 +193,7 @@ void AsyncCommunicator::RecvNoBarrier() { auto &var_name = iter.first; VLOG(4) << "recv var " << var_name; auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_, false); + recv_functor(iter.second, *recv_scope_); }; task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); } @@ -197,7 +203,7 @@ void AsyncCommunicator::RecvNoBarrier() { } } -int AsyncCommunicator::Meet() { +int AsyncCommunicator::BatchesCounter() { auto &step_queue = send_varname_to_queue_.at(STEP_COUNTER); size_t merged_var_num = 0; @@ -316,7 +322,7 @@ void HalfAsyncCommunicator::Clean() { } } -int HalfAsyncCommunicator::Meet() { +int HalfAsyncCommunicator::BatchesCounter() { while (running_) { if (barrier_counter_.load() >= barrier_trigger_.load() && barrier_trigger_.load() != 0) { @@ -443,7 +449,7 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, old_scope_.reset(new Scope()); pserver_scope_.reset(new Scope()); - Init(); + InitParams(); } void GeoCommunicator::Send(const std::vector &var_names, @@ -626,9 +632,7 @@ void GeoCommunicator::RecvByCommunicator() { if (recv_ctx.is_sparse) { RecvSparse(var_name); } else { - VLOG(1) << "recv dense " << var_name << " begin"; RecvDense(var_name); - VLOG(1) << "recv dense " << var_name << " done"; } }; tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task))); @@ -696,7 +700,7 @@ void GeoCommunicator::RecvDense(const std::string &varname) { auto &ctx = 
recv_varname_to_ctx_.at(varname); auto recv = distributed::ParameterRecv(); - recv(ctx, *pserver_scope_, true); + recv(ctx, *pserver_scope_); PADDLE_ENFORCE_EQ( var_psrever->IsInitialized(), true, @@ -721,7 +725,7 @@ void GeoCommunicator::RecvDense(const std::string &varname) { t_timestamp->data()); } -void GeoCommunicator::Init() { +void GeoCommunicator::InitParams() { std::vector> tasks; tasks.reserve(recv_varname_to_ctx_.size()); @@ -744,12 +748,17 @@ void GeoCommunicator::Init() { } void GeoCommunicator::InitDense(const std::string varname) { - auto *var = old_scope_->Var(varname); - var->GetMutable(); - auto &ctx = recv_varname_to_ctx_.at(varname); auto recv = distributed::ParameterRecv(); - recv(ctx, *old_scope_); + recv(ctx, *recv_scope_); + + auto *global_var = recv_scope_->FindVar(varname); + global_var->GetMutable(); + + auto *old_var = old_scope_->Var(varname); + old_var->GetMutable(); + + framework::CopyVariable(*global_var, old_var); VLOG(1) << "init dense variable " << varname << " done"; } @@ -781,22 +790,41 @@ void GeoCommunicator::InitSparse() { LargeScaleKV::Init(metas); - for (size_t i = 0; i < metas.size(); i++) { - auto &varname = metas[i].name; - auto &dict = dicts[i]; + for (auto &meta : metas) { + auto &ctx = recv_varname_to_ctx_.at(meta.name); + auto recv = distributed::ParameterRecv(); - std::vector ids; - ids.reserve(dict); + auto *global_var = recv_scope_->FindVar(meta.name); + auto global_value = global_var->Get(); + auto rows = global_value.dims()[0]; + auto dim1 = global_value.dims()[1]; - for (auto j = 0; j < dict; ++j) { - ids.push_back(j); - } + recv(ctx, *recv_scope_); + VLOG(1) << "recv " << meta.name << " with global scope for init"; + + auto n_rows = global_var->Get().dims()[0]; + + PADDLE_ENFORCE_EQ( + rows, n_rows, + platform::errors::InvalidArgument( + "global var: %s origin dim must equal recved rows", meta.name)); + + std::vector ids(rows); + std::iota(ids.begin(), ids.end(), 0); auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Init(ids); + std::vector *>> values; + + ins->Get(meta.name)->Init(ids); + ins->Get(meta.name)->Get(ids, {"Param"}, &values); - VLOG(3) << "GeoCommunicator init sparse " << varname << " with size " - << ids.size(); + auto blas = math::GetBlas( + paddle::platform::CPUDeviceContext()); + + for (auto &id : ids) { + blas.VCOPY(dim1, global_value.data() + id * dim1, + values[id][0]->data()); + } } VLOG(3) << "init sparse variable done"; diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 2f6da150d1e1375c332f7e55ea5b16c07f067a40..07fd4ed496000d949e11663b7c2b758f9f23587d 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -15,20 +15,23 @@ limitations under the License. 
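Aside on the GeoCommunicator::InitSparse rewrite above: the full parameter is first pulled into the recv scope, and each row of the dense global value is then copied into its slot in the large-scale KV table with blas.VCOPY. A minimal standalone sketch of that per-row copy, with a plain vector-of-vectors standing in for the "Param" slots returned by LargeScaleKV (names here are illustrative, not the Paddle API):

#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical stand-in for the per-row "Param" buffers returned by the
// large-scale KV table: one float slot per row id.
void InitSparseRows(const std::vector<float>& global_value,  // rows x dim1, row-major
                    int64_t rows, int64_t dim1,
                    std::vector<std::vector<float>>* param_slots) {
  param_slots->assign(rows, std::vector<float>(dim1, 0.f));
  for (int64_t id = 0; id < rows; ++id) {
    // Plays the role of blas.VCOPY(dim1, global_value.data() + id * dim1, slot).
    std::memcpy((*param_slots)[id].data(), global_value.data() + id * dim1,
                sizeof(float) * dim1);
  }
}

int main() {
  std::vector<float> global = {1, 1, 2, 2, 3, 3};  // 3 rows, dim1 = 2
  std::vector<std::vector<float>> slots;
  InitSparseRows(global, 3, 2, &slots);
  return slots[2][0] == 3.f ? 0 : 1;
}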
*/ #pragma once #include +#include #include #include #include #include +#include #include #include #include #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/communicator_common.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/large_scale_kv.h" @@ -279,6 +282,8 @@ class AsyncCommunicator : public Communicator { const RpcCtxMap &recv_varname_to_ctx, Scope *recv_scope) override; + void InitParams(); + void MainThread(); void Send(const std::vector &var_names, @@ -293,7 +298,7 @@ class AsyncCommunicator : public Communicator { virtual void RecvNoBarrier(); - virtual int Meet(); + virtual int BatchesCounter(); virtual void BarrierSend() {} @@ -350,7 +355,7 @@ class HalfAsyncCommunicator : public AsyncCommunicator { void BarrierTriggerReset(int initial_val) override; - int Meet(); + int BatchesCounter(); void BarrierWeakUp(); @@ -435,7 +440,7 @@ class GeoCommunicator : public AsyncCommunicator { void RecvDense(const std::string &varname); - void Init(); + void InitParams(); void InitSparse(); diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc index b9a6afa4cc351b59e5bd91e483bd654a7aade041..38b7c8b00317e6880434e975438c72ba9248aee2 100644 --- a/paddle/fluid/operators/distributed/communicator_test.cc +++ b/paddle/fluid/operators/distributed/communicator_test.cc @@ -12,11 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include -#include -#include -#include #include "paddle/fluid/operators/distributed/communicator.h" diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc index c2cb0d7f04eb67275030e841740f0cdb291d9f87..7d6756b41363d12af68402817cfee1df408b8827 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc @@ -19,6 +19,10 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" +namespace grpc { +class ByteBuffer; +} // namespace grpc + namespace paddle { namespace operators { namespace distributed { diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h index e9074574cdd163bbf7e62939df9283352706f840..486870de7a554e675bb01492e775654bbcb34da3 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h @@ -26,8 +26,12 @@ limitations under the License. 
*/ #include "grpc++/grpc++.h" #include "paddle/fluid/operators/distributed/variable_response.h" +struct grpc_byte_buffer; + namespace grpc { // A ZeroCopyInputStream that reads from grpc_byte_buffer +class ByteBuffer; + class GrpcBufferReader final : public ::google::protobuf::io::ZeroCopyInputStream { typedef void (CoreCodegenInterface::*OldReaderInitAPI)( diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h index 6b6249540c6d15954743c414a60472bf1f831151..22ca74a67e72b44216348e8df3914d2d23ae75c3 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include - #include // NOLINT #include // NOLINT #include @@ -47,6 +46,18 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN +namespace grpc { +class Channel; +} // namespace grpc +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace distributed { diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc index eddd89cf20c2eb91e88d666a6ffe4a045df7298b..0372846ce0debeba6614fd0aac30fd9c1561f763 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc @@ -17,19 +17,26 @@ limitations under the License. */ #endif #include #include -#include // NOLINT - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" +#include "grpcpp/impl/codegen/byte_buffer.h" +#include "grpcpp/impl/codegen/slice.h" #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace framework { +class Scope; +class Variable; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace distributed { diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h index 30e6907656e25bc7bcae77d3bd02638f6bb7601d..932f3e2f069a2bfe1dec9318446e1bf064d2e317 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.h @@ -24,10 +24,22 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/port.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" +namespace grpc { +class ByteBuffer; +} // namespace grpc +namespace paddle { +namespace framework { +class Scope; +class Variable; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index 5c0232a50a9066f782be5269b4041958748c2e23..47e114ff4b235a39d29f7061c8dbd7db1c9eb09a 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -20,6 +20,20 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc/grpc_server.h" +namespace grpc { +class ChannelArguments; +} // namespace grpc +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +namespace operators { +namespace distributed { +class GRPCVariableResponse; +} // namespace distributed +} // namespace operators +} // namespace paddle + using ::grpc::ServerAsyncResponseWriter; DECLARE_bool(rpc_disable_reuse_port); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h index ee6950205b31d9e2d3cd8722daf1c12117a17029..3d68b7e8cebb400680458a1163d52b01f8c8dc2e 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.h @@ -37,6 +37,10 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/profiler.h" +namespace grpc { +class ServerCompletionQueue; +} // namespace grpc + namespace paddle { namespace operators { namespace distributed { diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc index e46d2fbe01c16a8b0cbf402f6ffff4907adf7356..7d7723f19454c0c5c03b26a0737bcc8241c94889 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc @@ -12,16 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include #include -#ifdef PADDLE_WITH_NCCL -#include -#endif +#include "google/protobuf/io/coded_stream.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" +namespace google { +namespace protobuf { +namespace io { +class ZeroCopyInputStream; +} // namespace io +} // namespace protobuf +} // namespace google +namespace grpc { +class ByteBuffer; +} // namespace grpc + namespace paddle { namespace operators { namespace distributed { diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h index 3ca1d89f750313791c833a9f1f58760406e690c2..4d12b4a4bacd7ffee6ac7725951b967f7eb2da15 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h @@ -16,19 +16,30 @@ #include +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/variable_response.h" +namespace grpc { +class ByteBuffer; +} // namespace grpc +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace distributed { diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.cc b/paddle/fluid/operators/distributed/heart_beat_monitor.cc index 6736ea4336ba40c077352dac9f7f0ff3660ff3b1..84ba9793c4e00a59081e28dc63506ba2ac995de5 100644 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.cc +++ b/paddle/fluid/operators/distributed/heart_beat_monitor.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include // NOLINT + #include namespace paddle { diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.h b/paddle/fluid/operators/distributed/heart_beat_monitor.h index 211e40757fc752ecd8846a45fae73db143f481e3..cfef492de0e16dfe75fa0c331ce35a6e9aaf81b5 100644 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.h +++ b/paddle/fluid/operators/distributed/heart_beat_monitor.h @@ -14,21 +14,18 @@ #pragma once +#include #include - #include #include // NOLINT #include #include +#include // NOLINT #include #include #include #include -#include // NOLINT - -#include - #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc index 699c03f6f288919b2e1ab622e9be8283dce4e808..7c2c0fbff11befa353a024241c99b0a7f6144566 100644 --- a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc +++ b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc @@ -14,9 +14,6 @@ #include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include -#include // NOLINT - #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h index 9e39e68cba779de4dc598046e45f7d35e292bb79..b4388c0002a78b3ad66efeeb9fee2ad3c7f65c1e 100644 --- a/paddle/fluid/operators/distributed/large_scale_kv.h +++ b/paddle/fluid/operators/distributed/large_scale_kv.h @@ -16,7 +16,6 @@ #include #include - #include #include // NOLINT #include diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index a9378d61c3ca39bd43b558633cc4d04c40175cac..6b33c1f5fcdca85271c3d4110e057523b5415fb6 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -12,30 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" #include #include -#include #include #include -#include - -#include "paddle/fluid/operators/distributed/parameter_prefetch.h" - #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" - #include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace framework { +class ExecutionContext; +class Scope; +} // namespace framework +} // namespace paddle namespace paddle { namespace operators { namespace distributed { +class RPCClient; + using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 8605bcdcd86759d5c5b45fdcbb1e68407621fc08..6fd3a998813c0ba32b8b694b6655e1c73f45d62b 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -20,6 +20,13 @@ #include "paddle/fluid/framework/operator.h" +namespace paddle { +namespace framework { +class ExecutionContext; +class Scope; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { namespace distributed { diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index 5409ec54987fbb7ad89f61cc1655a4c3ef302ac0..a91df5b3c471e234dd1ae72771c287e21ebf7af0 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -12,37 +12,95 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include -#include -#include -#include - -#include "paddle/fluid/operators/distributed/parameter_recv.h" +#include "glog/logging.h" +#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" - +#include "paddle/fluid/operators/distributed/communicator_common.h" #include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" -#include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/fluid/operators/distributed/parameter_recv.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { namespace distributed { +class RPCClient; + using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; template -void RecvSelectedRows(const CommContext &rpc_ctx, - const framework::Scope &scope) { +void RecvSparseLodTensor(const CommContext &rpc_ctx, + const framework::Scope &scope) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto cpu_place = platform::CPUPlace(); + auto &cpu_ctx = *pool.Get(cpu_place); + + distributed::RPCClient *rpc_client = + distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); + + std::unique_ptr local_scope = scope.NewTmpScope(); + std::vector tensors; + std::vector rets; + for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { + auto &recv_var_name = rpc_ctx.splited_varnames[i]; + auto *local_var = local_scope->Var(recv_var_name); + VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; + // sparse param in recv_scope is LoDTensor + rets.push_back(rpc_client->AsyncGetVarNoBarrier( + rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name, + recv_var_name)); + + const auto *value = local_var->Get().data(); + tensors.push_back(value); + } + + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( + "internal error in RPCClient")); + } + + auto *merged_var = scope.FindVar(rpc_ctx.var_name); + + if (merged_var == nullptr || !merged_var->IsInitialized()) { + PADDLE_THROW( + platform::errors::InvalidArgument("%s must initialized at first.")); + } + auto dims1 = merged_var->Get().dims()[1]; + int64_t height = 0; + for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { + auto *splited_var = local_scope->FindVar(rpc_ctx.splited_varnames[i]); + height += splited_var->Get().dims()[0]; + } + + PADDLE_ENFORCE_EQ(merged_var->Get().dims()[0], height, + "recved var must has same dims with local var"); + + auto *merged_t = merged_var->GetMutable(); + auto *merged_d = merged_t->mutable_data(cpu_place); + + auto pserver_num = rpc_ctx.splited_varnames.size(); + for (int x = 0; x < height; ++x) { + auto id = x % pserver_num; + auto idx = x / pserver_num; + std::memcpy(merged_d + x * dims1, tensors[id] + idx * dims1, + sizeof(float) * dims1); + } +} + +template +void RecvGeoSparseRecords(const CommContext &rpc_ctx, + const framework::Scope &scope) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto cpu_place = platform::CPUPlace(); auto 
&cpu_ctx = *pool.Get(cpu_place); @@ -84,9 +142,14 @@ void RecvSelectedRows(const CommContext &rpc_ctx, ids_num += recv_t.rows().size(); width = recv_t.value().dims()[1]; - std::transform(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids), - [&](int64_t id) { return id * pserver_num + i; }); + if (rpc_ctx.is_distributed) { + std::copy(recv_t.rows().begin(), recv_t.rows().end(), + std::back_inserter(all_ids)); + } else { + std::transform(recv_t.rows().begin(), recv_t.rows().end(), + std::back_inserter(all_ids), + [&](int64_t id) { return id * pserver_num + i; }); + } } auto *var = scope.FindVar(rpc_ctx.var_name); @@ -146,7 +209,8 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { template void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, bool barrier) { + const framework::Scope &scope, + bool geo_records) { VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1, @@ -154,18 +218,21 @@ void ParameterRecv::operator()(const CommContext &rpc_ctx, "origin_varnames.size() >= 1 is permitted")); if (rpc_ctx.is_sparse) { - RecvSelectedRows(rpc_ctx, scope); + if (geo_records) { + RecvGeoSparseRecords(rpc_ctx, scope); + } else { + RecvSparseLodTensor(rpc_ctx, scope); + } } else { RecvLodTensor(rpc_ctx, scope); } VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; } - template void ParameterRecv::operator()(const CommContext &rpc_ctx, const framework::Scope &scope) { - this->operator()(rpc_ctx, scope, true); + this->operator()(rpc_ctx, scope, false); } template struct ParameterRecv; diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 545b1f5e803c60f8c68005849336e1d3e4893df7..99af06bc7cc6f83fda0714f6ed13a2afb5641e8e 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -14,26 +14,31 @@ #include "paddle/fluid/operators/distributed/parameter_send.h" #include -#include -#include #include -#include - +#include "glog/logging.h" +#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" - +#include "paddle/fluid/operators/distributed/communicator_common.h" #include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" -#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +class Scope; +class Tensor; +} // namespace framework +} // namespace paddle namespace paddle { namespace operators { namespace distributed { +class RPCClient; + using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index 42621724e68f40617bebd2b01e2af5dd23387163..6d239673f9104131c3129ea822e5c9f892845ea1 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h 
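On the new RecvSparseLodTensor path above: each pserver returns one dense shard, and the merged parameter is rebuilt with a round-robin mapping, so global row x lives on server x % pserver_num at local offset x / pserver_num (RecvGeoSparseRecords applies the inverse id * pserver_num + i mapping to recover global ids, unless the table is marked is_distributed). A small self-contained sketch of the reassembly loop, with plain vectors standing in for the received LoDTensors:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// shards[i] holds the rows stored on pserver i (row-major, `width` columns).
// Global row x sits in shards[x % num_pservers] at local row x / num_pservers.
std::vector<float> MergeRoundRobinShards(
    const std::vector<std::vector<float>>& shards, int64_t height,
    int64_t width) {
  std::vector<float> merged(height * width);
  const int64_t num_pservers = static_cast<int64_t>(shards.size());
  for (int64_t x = 0; x < height; ++x) {
    const int64_t id = x % num_pservers;   // which pserver holds the row
    const int64_t idx = x / num_pservers;  // row offset inside that shard
    std::memcpy(merged.data() + x * width, shards[id].data() + idx * width,
                sizeof(float) * width);
  }
  return merged;
}

int main() {
  // 5 rows of width 2 sharded over 2 pservers: server 0 holds rows {0, 2, 4},
  // server 1 holds rows {1, 3}.
  std::vector<std::vector<float>> shards = {{0, 0, 2, 2, 4, 4}, {1, 1, 3, 3}};
  auto merged = MergeRoundRobinShards(shards, 5, 2);
  assert(merged[2 * 2] == 2.f && merged[3 * 2] == 3.f);  // rows are back in order
  return 0;
}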
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -15,7 +15,6 @@ #pragma once #include - #include #include #include @@ -33,6 +32,13 @@ #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/distributed/request_handler.h" +namespace paddle { +namespace framework { +class Scope; +class Variable; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { namespace distributed { diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 69a5e3274318337f5424afa6492da829e04daa69..6a6a795a46b52c6b0ea289ee3e15c827af8688cf 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -17,13 +17,22 @@ #include // NOLINT #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/distributed/request_handler.h" +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + DECLARE_int32(rpc_deadline); DECLARE_int32(rpc_retry_times); diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 691c2c1048f77866fe3e0e6a22cf6a6daebf0040..52b4456f7b1eb904ddf85dc6e8f3088964da444a 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -15,15 +15,23 @@ #include "paddle/fluid/operators/distributed/rpc_server.h" #include -#include -#include #include -#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { namespace distributed { +class RequestHandler; + void RPCServer::ShutDown() { VLOG(3) << "RPCServer ShutDown "; ShutDownImpl(); diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index 8c7b7f1d7eeeced24d2ade2bcff4261b24587624..f83144f626881642e8da2685841881c2bff86390 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -18,6 +18,7 @@ #include #include #include // NOLINT +#include #include #include @@ -25,10 +26,21 @@ #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace distributed { +class RequestHandler; + struct MonomerHandle { std::string var_name_; std::string rpc_name_; diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index b52fb93e5bf7fb27faa0ccc4e2fb4b08112ba6b9..2e9d958ebfba6f4211e7af886af22d92487542f1 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -11,17 +11,20 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif #include -#include // NOLINT -#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +namespace memory { +namespace allocation { +class Allocation; +} // namespace allocation +} // namespace memory +} // namespace paddle DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not."); DEFINE_int32(rpc_retry_bind_port, 3, diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 5457101a5c9f3eb22f76877676f4a8a750a0f914..8a382baa5be7c15a1bd4215ef4702e96e0045f85 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #include #include #include @@ -27,6 +28,21 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/platform/port.h" +namespace paddle { +namespace framework { +class Tensor; +class Variable; +} // namespace framework +namespace memory { +namespace allocation { +class Allocation; +} // namespace allocation +} // namespace memory +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace distributed { diff --git a/paddle/fluid/operators/distributed/varhandle_test.cc b/paddle/fluid/operators/distributed/varhandle_test.cc index a0fcaf886475c5e03d959ffd6af22b2123526b9f..7c52ef74b4c2e5e060a2a46ea8c2ebc727f0cbae 100644 --- a/paddle/fluid/operators/distributed/varhandle_test.cc +++ b/paddle/fluid/operators/distributed/varhandle_test.cc @@ -12,11 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include // NOLINT - -#include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/operators/distributed/request_handler.h" diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index d979cd8a881ec7d697eae06b4911d597730b6908..be67a2396f7d713aad3b6bc46210cd9345d3dc55 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -16,17 +16,33 @@ #include +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/distributed/distributed_pb.h" +namespace google { +namespace protobuf { +namespace io { +class CodedInputStream; +class ZeroCopyInputStream; +} // namespace io +} // namespace protobuf +} // namespace google +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + DECLARE_string(rpc_server_profile_path); namespace paddle { diff --git a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc index 2ed2acb96dc842b6a60bf31701d39ac94dab9804..abc8d912840dba6f3da8acd3a911769c8748b2a8 100644 --- a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc +++ b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc @@ -9,15 +9,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include // NOLINT -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" -#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc index b064265917b2a36b2261c6c43d355f9891aa9811..755cbf017d9d4be5e4d75024e29b09beecf53db4 100644 --- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc @@ -12,19 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include // NOLINT -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle namespace paddle { namespace operators { +namespace distributed { +class RPCClient; +} // namespace distributed + class FetchBarrierOp : public framework::OperatorBase { public: FetchBarrierOp(const std::string& type, @@ -48,7 +58,9 @@ class FetchBarrierOp : public framework::OperatorBase { } for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient"); + PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, + platform::errors::Unavailable( + "Internal error occurred in RPCClient.")); } } }; diff --git a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h index 1199a63d16a4ecddf04eef468aea42d147608783..25ad16e3fce37837838af5360f79b9bece337373 100644 --- a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h +++ b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h @@ -31,9 +31,25 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class Executor; +class ProgramDesc; +class Scope; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { +namespace distributed { +class RPCServer; +class RequestHandler; +} // namespace distributed + constexpr char kOptimizeBlocks[] = "optimize_blocks"; void FlRunServer(std::shared_ptr service); diff --git a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc index e63f882478351cde16bde969b86e020181d6d4e5..db8c2f3f2d8660b0390cb35d76b2ef800c631788 100644 --- a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc @@ -12,18 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include #include #include +#include "glog/logging.h" #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h index b41e4e87722f638e6661a5116ebdfbc02c32710f..bacfd32cc739193a18317cb7d57c16953a4b5c84 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h @@ -31,9 +31,25 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class Executor; +class ProgramDesc; +class Scope; +} // namespace framework +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { +namespace distributed { +class RPCServer; +class RequestHandler; +} // namespace distributed + constexpr char kOptimizeBlocks[] = "optimize_blocks"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; constexpr char kCheckpointBlockId[] = "checkpint_block_id"; diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e53ce8cc67c08269e15a20e2cd2fc57a2c5ace17 --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h" + +#include +namespace paddle { +namespace operators { + +class LargeScaleFuseAdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of LargeScaleFuseAdamOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("LearningRate"), + "Input(LearningRate) of LargeScaleFuseAdamOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + + PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function."); + + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 element"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Grad"); + return framework::OpKernelType(data_type, ctx.device_context()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + if (var_name == "LearningRate") { + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class LargeScaleFuseAdamOpInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto in_var_type = ctx->GetInputType("Grad"); + PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::SELECTED_ROWS || + in_var_type == framework::proto::VarType::LOD_TENSOR, + true, platform::errors::InvalidArgument( + "The input Var's type should be LoDtensor or " + "SelectedRows, but the received type is %s", + in_var_type)); + } +}; + +class LargeScaleFuseAdamOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Grad", + "(SelectedRows) Ids's type should be SelectedRows" + "THe ids to be looked up in W."); + + AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); + AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); + AddInput("LearningRate", "(Tensor) Learning rate of SGD"); + AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator"); + AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator"); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "first moment estimates.") + .SetDefault(0.9f); + + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the " + "second moment estimates.") + .SetDefault(0.999f); + + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + + AddAttr("is_entry", + "(bool)" + "sparse table need entry"); + + AddAttr("tablename", + "(string)" + "sparse table name"); + + AddAttr>("value_names", + "(strings)" + "sparse table name"); + + AddComment(R"DOC( +Adam Optimizer. + +This implements the Adam optimizer from Section 2 of the Adam +paper : https://arxiv.org/abs/1412.6980. +Adam is a first-order gradient-based optimization method based on +adaptive estimates of lower-order moments. 
+ +Adam updates: + +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + lookup_sparse_table_fuse_adam, ops::LargeScaleFuseAdamOp, + ops::LargeScaleFuseAdamOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::LargeScaleFuseAdamOpInferVarType); + +REGISTER_OP_CPU_KERNEL( + lookup_sparse_table_fuse_adam, + ops::LargeScaleFuseAdamOpKernel); diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h new file mode 100644 index 0000000000000000000000000000000000000000..89b8d54a463b03076c9489b842540ea4a4f68a82 --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // for sqrt in CPU and CUDA +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/large_scale_kv.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +template +class LargeScaleFuseAdamOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override; +}; + +template +class LargeScaleFuseAdamOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using paddle::framework::LoDTensor; + + const auto *learning_rate = ctx.Input("LearningRate"); + const auto *grad_var = ctx.InputVar("Grad"); + + PADDLE_ENFORCE( + grad_var->IsType(), + platform::errors::InvalidArgument( + "in large scale optimize, gradient should only be SelectedRows")); + + const auto &grad = grad_var->Get(); + + // for distributed training, a sparse var may be empty, + // just skip updating. 
+ if (grad.rows().size() == 0) { + return; + } + + framework::SelectedRows tmp_grad_merge; + const framework::SelectedRows *grad_merge_ptr; + math::scatter::MergeAdd merge_func; + merge_func(ctx.template device_context(), grad, + &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; + + std::vector in_rows; + in_rows.reserve(grad_merge_ptr->rows().size()); + std::copy(grad_merge_ptr->rows().begin(), grad_merge_ptr->rows().end(), + std::back_inserter(in_rows)); + + const auto *lr = learning_rate->data(); + auto grad_v = grad_merge_ptr->value(); + auto grad_width = grad_v.dims()[1]; + + // auto is_entry = context.Attr("is_entry"); + auto tablename = ctx.Attr("tablename"); + auto value_names = ctx.Attr>("value_names"); + + auto *beta1_pow = ctx.Input("Beta1Pow"); + auto *beta2_pow = ctx.Input("Beta2Pow"); + auto *beta1_pow_out = ctx.Output("Beta1PowOut"); + auto *beta2_pow_out = ctx.Output("Beta2PowOut"); + T epsilon = static_cast(ctx.Attr("epsilon")); + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + + PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + // update beta1 and beta2 + beta1_pow_out->mutable_data(ctx.GetPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(ctx.GetPlace())[0] = + beta2 * beta2_pow->data()[0]; + + std::vector *>> values; + std::vector dims; + + auto *ins = distributed::LargeScaleKV::GetInstance(); + auto *table = ins->Get(tablename); + table->Get(in_rows, value_names, &values); + table->Dims({"Param"}, &dims); + + PADDLE_ENFORCE_EQ(dims[0], grad_width, + platform::errors::InvalidArgument( + "param_row should have the same size with grad_row")); + + T lr_ = lr[0]; + T beta1_pow_ = beta1_pow->data()[0]; + T beta2_pow_ = beta2_pow->data()[0]; + + lr_ *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); + + for (size_t i = 0; i < in_rows.size(); i++) { + auto ¶ms = values[i][0]; + auto &moment_1 = values[i][1]; + auto &moment_2 = values[i][2]; + + auto *p_data = params->data(); + auto *m1_data = moment_1->data(); + auto *m2_data = moment_2->data(); + + for (int x = 0; x < grad_width; ++x) { + auto g = grad_v.data()[grad_width * i + x]; + m1_data[x] = beta1 * m1_data[x] + (1 - beta1) * g; + m2_data[x] = beta2 * m2_data[x] + (1 - beta2) * g * g; + p_data[x] -= lr_ * (m1_data[x] / (sqrt(m2_data[x]) + epsilon)); + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..010658b5280d7feeb683112b401dbcaaa265daac --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
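The lookup_sparse_table_fuse_adam kernel above applies a textbook Adam step to each row fetched from the large-scale KV table, folding the bias correction sqrt(1 - beta2^t) / (1 - beta1^t) into the learning rate up front, exactly as in the DOC equations. A standalone sketch of the per-row update, independent of the Paddle containers:

#include <cmath>
#include <cstddef>
#include <vector>

// One Adam step for a single parameter row; mirrors the inner loop of
// LargeScaleFuseAdamOpKernel. beta1_pow and beta2_pow are beta1^t and beta2^t.
void AdamRowUpdate(std::vector<float>* param, std::vector<float>* m1,
                   std::vector<float>* m2, const std::vector<float>& grad,
                   float lr, float beta1, float beta2, float epsilon,
                   float beta1_pow, float beta2_pow) {
  const float lr_corrected = lr * std::sqrt(1 - beta2_pow) / (1 - beta1_pow);
  for (size_t x = 0; x < grad.size(); ++x) {
    (*m1)[x] = beta1 * (*m1)[x] + (1 - beta1) * grad[x];
    (*m2)[x] = beta2 * (*m2)[x] + (1 - beta2) * grad[x] * grad[x];
    (*param)[x] -= lr_corrected * ((*m1)[x] / (std::sqrt((*m2)[x]) + epsilon));
  }
}

int main() {
  std::vector<float> param{1.f}, m1{0.f}, m2{0.f}, grad{0.5f};
  AdamRowUpdate(&param, &m1, &m2, grad, /*lr=*/0.1f, 0.9f, 0.999f, 1e-8f,
                /*beta1_pow=*/0.9f, /*beta2_pow=*/0.999f);
  return param[0] < 1.f ? 0 : 1;  // the parameter moved against the gradient
}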
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h" + +#include +namespace paddle { +namespace operators { + +class LargeScaleFuseSGDOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of LargeScaleFuseSGDOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("LearningRate"), + "Input(LearningRate) of LargeScaleFuseSGDOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + + PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function."); + + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 element"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Grad"); + return framework::OpKernelType(data_type, ctx.device_context()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + if (var_name == "LearningRate") { + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class LargeScaleFuseSGDOpInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto in_var_type = ctx->GetInputType("Grad"); + PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::SELECTED_ROWS || + in_var_type == framework::proto::VarType::LOD_TENSOR, + true, platform::errors::InvalidArgument( + "The input Var's type should be LoDtensor or " + "SelectedRows, but the received type is %s", + in_var_type)); + } +}; + +class LargeScaleFuseSGDOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Grad", + "(SelectedRows) Ids's type should be SelectedRows" + "THe ids to be looked up in W."); + AddInput("LearningRate", "(Tensor) Learning rate of SGD"); + AddAttr("is_entry", + "(bool)" + "sparse table need entry"); + + AddAttr("tablename", + "(string)" + "sparse table name"); + + AddAttr>("value_names", + "(strings)" + "sparse table name"); + + AddComment(R"DOC( + +LargeScaleFuseSGD operator + +This operator implements one step of the stochastic gradient descent algorithm. 
+ +$$param\_out = param - learning\_rate * grad$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + lookup_sparse_table_fuse_sgd, ops::LargeScaleFuseSGDOp, + ops::LargeScaleFuseSGDOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::LargeScaleFuseSGDOpInferVarType); + +REGISTER_OP_CPU_KERNEL( + lookup_sparse_table_fuse_sgd, + ops::LargeScaleFuseSGDOpKernel); diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5d4bf1015fa3a8c2c8fb102fcd890f41b296269d --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/large_scale_kv.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +template +class LargeScaleFuseSGDOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override; +}; + +template +class LargeScaleFuseSGDOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *learning_rate = ctx.Input("LearningRate"); + + const auto *grad_var = ctx.InputVar("Grad"); + + PADDLE_ENFORCE( + grad_var->IsType(), + platform::errors::InvalidArgument( + "in large scale optimize, gradient should only be SelectedRows")); + + const auto &grad = grad_var->Get(); + + // for distributed training, a sparse var may be empty, + // just skip updating. 
+ if (grad.rows().size() == 0) { + return; + } + + framework::SelectedRows tmp_grad_merge; + const framework::SelectedRows *grad_merge_ptr; + math::scatter::MergeAdd merge_func; + merge_func(ctx.template device_context(), grad, + &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; + + std::vector in_rows; + in_rows.reserve(grad_merge_ptr->rows().size()); + std::copy(grad_merge_ptr->rows().begin(), grad_merge_ptr->rows().end(), + std::back_inserter(in_rows)); + + const auto *lr = learning_rate->data(); + auto grad_v = grad_merge_ptr->value(); + auto grad_width = grad_v.dims()[1]; + + // auto is_entry = context.Attr("is_entry"); + auto tablename = ctx.Attr("tablename"); + auto value_names = ctx.Attr>("value_names"); + + std::vector *>> values; + std::vector dims; + + auto *ins = distributed::LargeScaleKV::GetInstance(); + auto *table = ins->Get(tablename); + table->Get(in_rows, value_names, &values); + table->Dims({"Param"}, &dims); + + PADDLE_ENFORCE_EQ(dims[0], grad_width, + platform::errors::InvalidArgument( + "param_row should have the same size with grad_row")); + + auto blas = math::GetBlas(ctx); + + std::vector grads; + framework::TensorToVector(grad_v, ctx.device_context(), &grads); + + blas.SCAL(grads.size(), lr[0], grads.data()); + + for (int x = 0; x < static_cast(in_rows.size()); ++x) { + auto ¶ms = values[x][0]; + blas.VSUB(grad_width, params->data(), grads.data() + grad_width * x, + params->data()); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed_ops/prefetch_op.cc b/paddle/fluid/operators/distributed_ops/prefetch_op.cc index 6037ab1523ec347574db4b18aa1de732acbd3336..007dbbbfbf5105b052598c9454dfcdc15319d658 100644 --- a/paddle/fluid/operators/distributed_ops/prefetch_op.cc +++ b/paddle/fluid/operators/distributed_ops/prefetch_op.cc @@ -12,18 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include // NOLINT -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle namespace paddle { namespace operators { +namespace distributed { +class RPCClient; +} // namespace distributed + class PrefetchOp : public framework::OperatorBase { public: PrefetchOp(const std::string& type, const framework::VariableNameMap& inputs, diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 15b36baeada300e1ab472737b4e35538f9882cb7..9729d0dadd7ed8e68056e7c87437944425efdfe0 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -12,22 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
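For clarity, the BLAS calls in LargeScaleFuseSGDOpKernel above (one SCAL over the merged gradient followed by a VSUB per selected row) amount to the plain loop below, i.e. param_out = param - learning_rate * grad for each sparse row. This is an illustrative sketch only; the helper name is hypothetical.

#include <cstddef>
#include <vector>

// Equivalent of blas.SCAL(grads.size(), lr, grads.data()) followed by
// blas.VSUB(width, param, grads + width * row, param) for every row.
template <typename T>
void SgdSparseRowsUpdate(const std::vector<T*>& param_rows, const T* grad,
                         T lr, std::size_t width) {
  for (std::size_t i = 0; i < param_rows.size(); ++i) {
    T* p = param_rows[i];
    const T* g = grad + i * width;  // merged gradient rows are stored contiguously
    for (std::size_t x = 0; x < width; ++x) {
      p[x] -= lr * g[x];
    }
  }
}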
*/ -#include // NOLINT -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/communicator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" #include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle namespace paddle { namespace operators { +namespace distributed { +class RPCClient; +} // namespace distributed + class RecvOp : public framework::OperatorBase { public: RecvOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -37,12 +45,6 @@ class RecvOp : public framework::OperatorBase { void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - int do_not_run = Attr("do_not_run"); - if (do_not_run) { - VLOG(3) << "recv do not run!"; - return; - } - std::vector epmap = Attr>("epmap"); std::vector varnames = Attr>("varnames"); @@ -63,11 +65,10 @@ class RecvOp : public framework::OperatorBase { if (recv_varnames.size() > 0) { auto *communicator = distributed::Communicator::GetInstance(); - if (communicator == nullptr) { + if (communicator != nullptr) { PADDLE_THROW(platform::errors::InvalidArgument( - "need run fleet.init_worker first")); + "execute startup program must before fleet.init_worker")); } - communicator->RecvNoBarrier(); } else { std::vector rets; if (with_barrier) { diff --git a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc index befdf4e938850cad4180d3d7a66cc67ea1b2e810..4727b3bb249de8a49b6955d581109e6eee3e3f8b 100644 --- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc +++ b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc @@ -13,8 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h" + #include +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h index 43dd9c3c98a0ab454a8114d8cdf58606db0684a4..d8639627c3ef66d3d5a9be86a1c6ada640efc533 100644 --- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h +++ b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc index a8e9379d214d8f85fd9847fa79046275fe595a18..5aa2ba26aa4d6eef73e52ea64041d57c24ec47c1 100644 --- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc @@ -12,20 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include // NOLINT -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle namespace paddle { namespace operators { +namespace distributed { +class RPCClient; +} // namespace distributed + class SendBarrierOp : public framework::OperatorBase { public: SendBarrierOp(const std::string& type, diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 53e3d70f960938bed77cba4112e22692dd7ed87b..a4192c18afae50033b40490e5a831a184caaa1b3 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -12,23 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include // NOLINT -#include - -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/distributed/communicator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" #include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" -#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle namespace paddle { namespace operators { +namespace distributed { +class RPCClient; +} // namespace distributed + class SendOp : public framework::OperatorBase { public: SendOp(const std::string& type, const framework::VariableNameMap& inputs, diff --git a/paddle/fluid/operators/distributed_ops/send_recv_util.h b/paddle/fluid/operators/distributed_ops/send_recv_util.h index c05a1ff1da8803c1ef3161d0e9d8604f9f1e5f3b..7dc0596ac31e2506ae02de11b33bd0532f02cc7a 100644 --- a/paddle/fluid/operators/distributed_ops/send_recv_util.h +++ b/paddle/fluid/operators/distributed_ops/send_recv_util.h @@ -34,16 +34,16 @@ inline bool NeedSend(const framework::Scope& scope, std::string::npos) return false; auto* var = scope.FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", - varname); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "Can not find variable '%s' in the send side.", varname)); if (var->IsType()) { return var->Get().IsInitialized(); } else if (var->IsType()) { return var->Get().rows().size() > 0UL; } else { - PADDLE_THROW( - "Variable type in send side should be in " - "[LodTensor, SelectedRows]"); + PADDLE_THROW(platform::errors::Unimplemented( + "Variable type in send side should be LodTensor or SelectedRows.")); } return false; } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 97624944ca109f27322f151f0742c72447fd5c39..68a98e7c6bc2a7cd405a6d1bc4448c0f084d0f0f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -20,6 +20,19 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 152f20273a1820e709f562d2c72b6cbc7e29dfb0..c4efc4ab72d6308d89220ef9e49ad2240b2ddef2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc index 5a398fa50febe2efffd588ce8f3612f1f9cec0b6..ddd69203fd316b41cd4b2b3077df6f4ac12f1a64 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc @@ -13,9 +13,26 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h" + #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { class ElementwiseFloorDivOpMaker : public ElementwiseOpMaker { diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index 692bc015c5b6fc47502b37412a2be2d90237c672..be6a63305475e9e44bcb0499b6438bdc3354677a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -13,10 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_max_op.h" -#include + #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index 1b2364a5a53d1817d18dccfe7363f2e37be5b1b7..bd40763e05a281e4042c3193b8fb763b9aea8909 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -13,10 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" -#include + #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index 1a49a6013987ae1ec685ec91ca656e4756ba7c32..5a3e7f90f3c3dbee093c17fd4c5cf863ad1f4d24 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc index 8c2e62bed195f27e228d5dd460ba21ed87c3f5d2..d8ad0a353c9cbebe7b40e70e5e2f293f38931376 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc @@ -13,9 +13,26 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_mod_op.h" + #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { class ElementwiseModOpMaker : public ElementwiseOpMaker { diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc index 1321eee8457173cd310b8ff66092386641d631fc..ea0e8e7c0138770fb45b12411f9acac9ce97777e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc @@ -10,10 +10,24 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h" -#include + #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h index a910c326196bc61758c3be7db3b8ac5d85b0095c..535d838209d0e418554d7f6facc932af6fff6185 100755 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.h @@ -13,6 +13,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 9603b022d5d8c4672219b7199d1bfe4fb6bfe3e9..90f4ebb99ec7d7be09ab4129fd89391be1037b1c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -13,10 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" -#include + #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index caaaf2c931de04e9e489130eae0491d450cd2dab..3dcf5bf6a32fadef88df647d898e237a6dcbe34c 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -14,6 +14,16 @@ #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" +namespace paddle { +namespace framework { +class ExecutionContext; +} // namespace framework +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { template diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc index be8b8d6c2f7c0d70c997b4759ffb6a6943491192..15c31a4cece5c538e35f25565a85a9b6b32f5c30 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc @@ -12,20 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include -#include -#include -#include #include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" USE_OP(elementwise_add); diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index 6ec8f2c2355ee098aed4a6b92410bcc60bca4736..cf9e9dbb04b03902ae9f96809d084b9126f15f94 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -12,14 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/enqueue_op.cc b/paddle/fluid/operators/enqueue_op.cc index 9b367a72fb5f4c46366323497bbf6ca0cc167cbb..a7920488f2e05618bddb302f38f391ae4eb838bd 100644 --- a/paddle/fluid/operators/enqueue_op.cc +++ b/paddle/fluid/operators/enqueue_op.cc @@ -13,13 +13,24 @@ // limitations under the License. 
#include -#include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + using LoDTensor = paddle::framework::LoDTensor; using LoDTensorBlockingQueueHolder = paddle::operators::reader::LoDTensorBlockingQueueHolder; diff --git a/paddle/fluid/operators/eye_op.h b/paddle/fluid/operators/eye_op.h index 0eefe7d2163bb967596480f2427b995a6a87ff6e..d5ad27596d6ba399a88059cb18d83933e8171ea2 100644 --- a/paddle/fluid/operators/eye_op.h +++ b/paddle/fluid/operators/eye_op.h @@ -51,7 +51,7 @@ class EyeKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); set_zero(dev_ctx, out_tensor, static_cast(0)); - int64_t num_eyes = std::min(num_rows, num_columns); + int64_t num_eyes = (std::min)(num_rows, num_columns); platform::ForRange for_range(dev_ctx, num_eyes); EyeFunctor functor(num_columns, out_data); for_range(functor); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 3fc5f3bfc6b1633ffe835606bbac6118e6b32ca6..477a9162fe3f779d4006deb2e20b3a16f70cdf47 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -8,7 +8,8 @@ register_operators(EXCLUDES multihead_matmul_op fused_embedding_eltwise_layernorm_op fusion_group_op - fusion_gru_op) + fusion_gru_op + fused_bn_add_activation_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -47,4 +48,9 @@ if (WITH_GPU) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_group);\n") cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op) endif() + # fused_bn_add_activation + if (NOT ${CUDNN_VERSION} VERSION_LESS 7401) + op_library(fused_bn_add_activation_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_bn_add_activation);\n") + endif() endif() diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index b22f28fbbe3ce8ce178a3d9c17a048817cb750e7..49fded886a0339a0456ee55d0d4d1249461f93b9 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -204,6 +204,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto x_dims = framework::vectorize(transformed_input.dims()); auto f_dims = framework::vectorize(filter->dims()); if (!exhaustive_search) { +#if CUDNN_VERSION >= 8000 int perf_count; int best_algo_idx = 0; size_t tmp_size = 0; @@ -215,13 +216,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo = (perf_results.get())[best_algo_idx].algo; - VLOG(3) << "cuDNN forward algo " << algo; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); if (workspace_size_in_bytes > workspace_size_limit) workspace_size_limit = workspace_size_in_bytes; +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_input_desc, 
cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + VLOG(3) << "cuDNN forward algo " << algo; +#endif } else { std::function search_func = [&]() -> cudnnConvolutionFwdAlgo_t { diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b3ed03bb6419cd3c36f6ee2e856f1816d314c75 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -0,0 +1,255 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +void FusedBatchNormAddActOp::InferShape( + framework::InferShapeContext *ctx) const { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", + "FusedBatchNormAddActOp"); + + // check output + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("MeanOut"), "Output", "MeanOut", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("VarianceOut"), "Output", "VarianceOut", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance", + "FusedBatchNormAddActOp"); + + const auto x_dims = ctx->GetInputDim("X"); + const auto z_dims = ctx->GetInputDim("Z"); + PADDLE_ENFORCE_EQ(x_dims, z_dims, + platform::errors::InvalidArgument( + "ShapeError: the shapes of input " + "must be equal. But received: the shape " + "of input X = [%s], and the shape of " + "input Y = [%s]", + x_dims, z_dims)); + PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::InvalidArgument( + "ShapeError: the dimensions of input " + "must greater than or equal to 2." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); + PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::InvalidArgument( + "ShapeError: the dimensions of input " + "must smaller than or equal to 5." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); + + const int64_t C = x_dims[x_dims.size() - 1]; + + auto scale_dim = ctx->GetInputDim("Scale"); + auto bias_dim = ctx->GetInputDim("Bias"); + + PADDLE_ENFORCE_EQ( + scale_dim.size(), 1UL, + platform::errors::InvalidArgument( + "ShapeError: the dimension of scale must equal to 1." 
+ "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, scale_dim.size())); + PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL, + platform::errors::InvalidArgument( + "ShapeError: the dimension of bias must equal to 1." + "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, bias_dim.size())); + + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 || + framework::product(bias_dim) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], C, + platform::errors::InvalidArgument( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, scale_dim[0])); + PADDLE_ENFORCE_EQ(bias_dim[0], C, + platform::errors::InvalidArgument( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, bias_dim[0])); + } + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("MeanOut", {C}); + ctx->SetOutputDim("VarianceOut", {C}); + ctx->SetOutputDim("SavedMean", {C}); + ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); +} + +framework::OpKernelType FusedBatchNormAddActOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + // By default, the type of the scale, bias, mean, + // and var tensors should be float when input tensor's dtype is float16. + auto bn_param_type = framework::proto::VarType::FP32; + + PADDLE_ENFORCE_EQ( + bn_param_type, ctx.Input("Scale")->type(), + platform::errors::InvalidArgument("Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, ctx.Input("Bias")->type(), + platform::errors::InvalidArgument("Bias input should be of float type")); + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); +} + +void FusedBatchNormAddActOpMaker::Make() { + AddInput("X", "The input tensor"); + AddInput("Z", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddOutput("Y", "result after normalization"); + AddOutput("MeanOut", + "Share memory with Mean. " + "Store the global mean when training"); + AddOutput("VarianceOut", + "Share memory with Variance. " + "Store the global Variance when training"); + AddOutput("SavedMean", + "Mean of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("SavedVariance", + "Variance of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("ReserveSpace", + "Reserve GPU space for triggering the new semi-persistent " + "NHWC kernel"); + AddAttr("momentum", "").SetDefault(0.9); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, + platform::errors::InvalidArgument( + "'epsilon' should be between 0.0 and 0.001.")); + }); + AddAttr("act_type", "The activation type to be fused.") + .SetDefault("relu"); + AddComment(R"DOC( +Fused Batch Normalization with activation. 
+ +Batch Norm has been implemented as discussed in the paper: +https://arxiv.org/pdf/1502.03167.pdf +Batch Norm can be used as a normalizer function for conv2d and fully_connected operations. +Now, the required data format for FusedBatchNormAddActOp is NHWC `[batch, in_height, in_width, in_channels]`. + +)DOC"); +} + +void FusedBatchNormAddActGradOp::InferShape( + framework::InferShapeContext *ctx) const { + // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", + framework::GradVarName("Y"), "FusedBatchNormAddActGradOp"); + + // check output + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Z")), "Output", + framework::GradVarName("Z"), "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale")), "Output", + framework::GradVarName("Scale"), "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias")), "Output", + framework::GradVarName("Bias"), "FusedBatchNormAddActGradOp"); + + const auto in_dims = ctx->GetInputDim("X"); + const int C = in_dims[in_dims.size() - 1]; + + ctx->SetOutputDim(framework::GradVarName("X"), in_dims); + ctx->SetOutputDim(framework::GradVarName("Z"), in_dims); + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); +} + +framework::OpKernelType FusedBatchNormAddActGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW(platform::errors::NotFound( + "Can not find Y@GRAD in the execution context.")); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW( + platform::errors::NotFound("Can not get the tensor value of Y@GRAD.")); + } + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout, + library); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_bn_add_activation, ops::FusedBatchNormAddActOp, + ops::FusedBatchNormAddActOpMaker, ops::FusedBatchNormAddActOpInferVarType, + ops::FusedBatchNormAddActGradOpMaker, + ops::FusedBatchNormAddActGradOpMaker); +REGISTER_OPERATOR(fused_bn_add_activation_grad, + ops::FusedBatchNormAddActGradOp); diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f1d297cda3fae54cdde089f25ccdf6715142c5f --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -0,0 +1,338 @@ +// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/float16.h" + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +template +using CudnnDataType = platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +class FusedBatchNormAddActKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + double epsilon = static_cast(ctx.Attr("epsilon")); + float momentum = ctx.Attr("momentum"); + std::string act_type = ctx.Attr("act_type"); + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + // Get the size for each dimension. 
+ // NHWC [batch_size, in_height, in_width, in_channels] + const auto *x = ctx.Input("X"); + const auto *z = ctx.Input("Z"); + const auto &in_dims = x->dims(); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + + auto *y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + + auto &dev_ctx = ctx.template device_context(); + + // ------------------- cudnn descriptors --------------------- + auto handle = dev_ctx.cudnn_handle(); + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, + data_desc_, mode_)); + + double this_factor = 1. - momentum; + cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; + platform::ScopedActivationDescriptor scope_act_desc; + cudnnActivationDescriptor_t activation_desc_ = + scope_act_desc.descriptor(act_type); + size_t workspace_size = 0; + size_t reserve_space_size = 0; + void *reserve_space_ptr = nullptr; + void *workspace_ptr = nullptr; + Tensor workspace_tensor; + // Create reserve space and workspace for batch norm. + // Create tensor for each batchnorm op, it will be used in the + // backward. Thus this tensor shouldn't be temp. 
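+    // Note: the workspace queried below is scratch memory needed only for
+    // this forward call, so it lives in the local workspace_tensor. The
+    // reserve space, by contrast, is read again by the backward kernel, which
+    // is why it is exposed through the ReserveSpace output rather than being
+    // allocated as a temporary.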
+ auto *reserve_space = ctx.Output("ReserveSpace"); + PADDLE_ENFORCE_NOT_NULL( + reserve_space, + platform::errors::NotFound( + "The argument ReserveSpace of batch_norm op is not found.")); + + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload:: + cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*xDesc=*/data_desc_, + /*zDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/activation_desc_, + /*sizeInBytes=*/&workspace_size)); + + // -------------- cudnn batchnorm reserve space -------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*activationDesc=*/activation_desc_, + /*xDesc=*/data_desc_, + /*sizeInBytes=*/&reserve_space_size)); + + reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(), + reserve_space_size); + workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), + workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnBatchNormalizationForwardTrainingEx( + handle, mode_, bnOps_, CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, z->template data(), data_desc_, + y->template data(), bn_param_desc_, + scale->template data>(), + bias->template data>(), this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()), + activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr, + reserve_space_size)); + + // clean when exit. 
+ PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +template +class FusedBatchNormAddActGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + double epsilon = static_cast(ctx.Attr("epsilon")); + std::string act_type = ctx.Attr("act_type"); + + const auto *x = ctx.Input("X"); + const auto *z = ctx.Input("Z"); + const auto *y = ctx.Input("Y"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *reserve_space = ctx.Input("ReserveSpace"); + + const auto &in_dims = x->dims(); + + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_z = ctx.Output(framework::GradVarName("Z")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_z->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE_EQ( + d_scale && d_bias, true, + platform::errors::PreconditionNotMet( + "Both the scale grad and the bias grad must not be null.")); + d_scale->mutable_data>(ctx.GetPlace()); + d_bias->mutable_data>(ctx.GetPlace()); + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL, + platform::errors::PreconditionNotMet( + "The scale only has one dimension.")); + PADDLE_ENFORCE_EQ( + scale->dims()[0], C, + platform::errors::PreconditionNotMet( + "The size of scale is equal to the channel of Input(X).")); + + auto &dev_ctx = ctx.template device_context(); + + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + in_dims.size() > 3 ? 
in_dims.size() : 4, dims.data(), strides.data())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, + data_desc_, mode_)); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const auto *saved_mean_data = + saved_mean->template data>(); + const auto *saved_var_data = + saved_var->template data>(); + + size_t workspace_size = 0; + void *workspace_ptr = nullptr; + Tensor workspace_tensor; + auto reserve_space_size = reserve_space->memory_size(); + cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; + platform::ScopedActivationDescriptor scope_act_desc; + cudnnActivationDescriptor_t activation_desc_ = + scope_act_desc.descriptor(act_type); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( + /*handle=*/dev_ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*xDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*dyDesc=*/data_desc_, + /*dzDesc=*/data_desc_, + /*dxDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/activation_desc_, + /*sizeInBytes=*/&workspace_size)); + + workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), + workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnBatchNormalizationBackwardEx( + /*handle=*/dev_ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*alphaDataDiff=*/CudnnDataType::kOne(), + /*betaDataDiff=*/CudnnDataType::kZero(), + /*alphaParamDiff=*/CudnnDataType::kOne(), + /*betaParamDiff=*/CudnnDataType::kZero(), + /*xDesc=*/data_desc_, + /*xData=*/x->template data(), + /*yDesc=*/data_desc_, + /*yData=*/y->template data(), + /*dyDesc=*/data_desc_, + /*dyData=*/d_y->template data(), + /*dzDesc=*/data_desc_, + /*dzData=*/d_z->template data(), + /*dxDesc=*/data_desc_, + /*dxData=*/d_x->template data(), + /*dBnScaleBiasDesc=*/bn_param_desc_, + /*bnScaleData=*/scale->template data>(), + /*bnBiasData=*/bias->template data>(), + /*dBnScaleData=*/d_scale->template data>(), + /*dBnBiasData=*/d_bias->template data>(), + /*epsilon=*/epsilon, + /*savedMean=*/saved_mean_data, + /*savedInvVariance=*/saved_var_data, + /*activationDesmc=*/activation_desc_, + /*workspace=*/workspace_ptr, + /*workSpaceSizeInBytes=*/workspace_size, + /*reserveSpace=*/const_cast(reserve_space->template data()), + /*reserveSpaceSizeInBytes=*/reserve_space_size)); + + // clean when exit. + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +} // namespace operators +} // namespace paddle + +#if CUDNN_VERSION >= 7401 +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + fused_bn_add_activation, + ops::FusedBatchNormAddActKernel); +REGISTER_OP_CUDA_KERNEL(fused_bn_add_activation_grad, + ops::FusedBatchNormAddActGradKernel< + plat::CUDADeviceContext, plat::float16>); +#endif diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5c7df96e60dd89b74058ead837bb75555f3674ad --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h @@ -0,0 +1,106 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
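To make the fused semantics concrete: the forward pass registered above computes y = act(batch_norm(x) + z) over an NHWC tensor. A rough CPU reference for the relu case, given per-channel statistics, could look like the sketch below. This is illustrative only; the real kernels delegate the whole computation to cuDNN's CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION path, and the helper name and plain std::vector interface are hypothetical.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// y = relu(batch_norm(x) + z) for a flattened NHWC buffer with C channels.
void FusedBnAddReluReference(const std::vector<float>& x,
                             const std::vector<float>& z,
                             const std::vector<float>& scale,  // size C
                             const std::vector<float>& bias,   // size C
                             const std::vector<float>& mean,   // size C
                             const std::vector<float>& var,    // size C
                             float epsilon, std::size_t C,
                             std::vector<float>* y) {
  y->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    const std::size_t c = i % C;  // channel is the innermost axis in NHWC
    const float bn =
        scale[c] * (x[i] - mean[c]) / std::sqrt(var[c] + epsilon) + bias[c];
    (*y)[i] = std::max(bn + z[i], 0.0f);  // add the extra input Z, then ReLU
  }
}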
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class FusedBatchNormAddActOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusedBatchNormAddActGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusedBatchNormAddActOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +template +class FusedBatchNormAddActGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Z", this->Input("Z")); + op->SetInput("Y", this->Output("Y")); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + + op->SetInput("Scale", this->Input("Scale")); + op->SetInput("Bias", this->Input("Bias")); + op->SetInput("SavedMean", this->Output("SavedMean")); + op->SetInput("SavedVariance", this->Output("SavedVariance")); + op->SetInput("ReserveSpace", this->Output("ReserveSpace")); + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Z"), this->InputGrad("Z")); + op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale")); + op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); + } +}; + +class FusedBatchNormAddActOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map& GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Y"}}; + return m; + } +}; + +template +class FusedBatchNormAddActKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +template +class FusedBatchNormAddActGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc 
b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 17cb4556d45ef3adee2adc0d2f19ea048e096982..37a442a78157166764b786215d97e11d661df894 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -13,10 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" -#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/cudnn_helper.h" +namespace paddle { +namespace platform { +struct CUDAPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index f6b156eb30dae154395b34dcfc26319cd89edbca..ea1fa813928daed1f661834c766e8cb2a362986f 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -13,10 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include -#include -#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 93f9e108723fbd56e0d3bf5d439614c2c20bb393..f0903bdfce9206981b13b9a1b61ca3eedf3195a0 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -15,6 +15,12 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/cudnn_helper.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index ba918b3def22e3c60c4155f77ecbaad85d520928..bdc5debaea790c740f2e133b66e2cfb9e334dc3e 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -14,6 +14,13 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/gru_op.h" +namespace paddle { +namespace platform { +class CUDADeviceContext; +struct CUDAPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index 712ef05d8631ac74b92795321202cb5590286e82..4865a02c5292ffb9d079d0711f0bf7d6e927c441 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -47,7 +47,9 @@ class GRUUnitKernel : public framework::OpKernel { else if (act_type == relu) ReluFunctor()(d, x, y); else - PADDLE_THROW("unsupported activation type"); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported activation type, only supports identity, sigmoid, tanh " + "and relu.")); } void Compute(const framework::ExecutionContext& context) const override { @@ -137,7 +139,9 @@ class GRUUnitGradKernel : public framework::OpKernel { else if (act_type == relu) ReluGradFunctor()(d, x, y, dy, dx); else - PADDLE_THROW("unsupported activation type"); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported activation type, only supports identity, sigmoid, tanh " + "and relu.")); } void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index b6017a6eafc03921883e9427732cf0c2c769bcab..47b480c11c28f265e74c2ef6c103e71f642eed2f 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -13,8 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/hash_op.h" + #include +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h index c2d530004912287b0720ab5d00da90c4e1b5cbc7..b5b3f3de70c280c7c135ad55793b1b64c608233d 100644 --- a/paddle/fluid/operators/hash_op.h +++ b/paddle/fluid/operators/hash_op.h @@ -18,6 +18,7 @@ extern "C" { #include } #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index b4f33dad927222869af3b99bee1368699982c1a4..e8edfb99f9f306d7057afcdf935cad5a5e4a73d6 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -13,9 +13,23 @@ // limitations under the License. 
#include "paddle/fluid/operators/increment_op.h" -#include + #include +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index a5b270c1dfef14bc92697c29bfeafa0fe08211d7..03279a9b2c15b8d918333fd61c07ed636f11d889 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -520,11 +520,11 @@ class InstanceNormDoubleGradKernel // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * // np.sum(dy, // axis=(h,w)) * (x - mean) * - // (np.mean(ddx, axis=(h,w)) - ddx) + ddr * (dy * inv_var - inv_var - // * + // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var - + // inv_var * // np.mean(dy, axis=(h,w)) - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(h,w)))) + // axis=(h,w))) Tensor x_sub_mean_mul_invstd; x_sub_mean_mul_invstd.Resize({sample_size, NxC}); diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 1e99e22e12b2a23685dad742f175fd2b0684d334..e8a9ed878e9bd502b9bd7e7d82f574fb5740bb5d 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -104,12 +104,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { auto dim_x = ctx->GetInputDim("X"); auto interp_method = ctx->Attrs().Get("interp_method"); - PADDLE_ENFORCE( - "bilinear" == interp_method || "nearest" == interp_method || - "bicubic" == interp_method, - "Interpolation method can only be \"bilinear\" or \"nearest\" when " - "Input(X) dimension is 4, but got method = %s .", - interp_method); + PADDLE_ENFORCE_EQ("bilinear" == interp_method || "nearest" == interp_method || + "bicubic" == interp_method, + true, platform::errors::InvalidArgument( + "Interpolation method can only be \"bilinear\" " + "or \"nearest\" or \"bicubic\" when " + "Input(X) dimension is 4, but got method is %s.", + interp_method)); const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); @@ -169,13 +170,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { auto out_size_dim = ctx->GetInputDim("OutSize"); PADDLE_ENFORCE_EQ( out_size_dim.size(), 1, - platform::errors::InvalidArgument( - "OutSize's dimension size must be 1, but got dimension = %d .", - out_size_dim.size())); + platform::errors::InvalidArgument("OutSize's dimension size must be 1, " + "but got dimension size is %d .", + out_size_dim.size())); PADDLE_ENFORCE_EQ( out_size_dim[0], 2, platform::errors::InvalidArgument( - "OutSize's dim[0] must be 2, but got dimention = %d .", + "OutSize's dimension[0] must be 2, but got dimension[0] is %d .", out_size_dim[0])); ctx->ShareLoD("X", "Out"); return; @@ -264,12 +265,15 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { auto out_size_dim = ctx->GetInputDim("OutSize"); - PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, - "OutSize's dimension size must be 1, but got size =%d .", - out_size_dim.size()); + PADDLE_ENFORCE_EQ( + out_size_dim.size(), 1, + platform::errors::InvalidArgument( + "OutSize's dimension size must be 1, but got size is %d.", + 
out_size_dim.size())); PADDLE_ENFORCE_EQ(out_size_dim[0], 3, - "OutSize's dim[0] must be 3, but got size = %d .", - out_size_dim[0]); + platform::errors::InvalidArgument( + "OutSize's dim[0] must be 3, but got size is %d.", + out_size_dim[0])); ctx->ShareLoD("X", "Out"); return; } @@ -289,10 +293,8 @@ class InterpolateOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of InterpolateOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of InterpolationOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Interpolate"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Interpolate"); auto dim_x = ctx->GetInputDim("X"); // NCHW format PADDLE_ENFORCE( @@ -534,9 +536,10 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InterpolateGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "InterpolateGrad"); + auto dim_x = ctx->GetInputDim("X"); if (ctx->HasOutput(framework::GradVarName("X"))) { ctx->SetOutputDim(framework::GradVarName("X"), dim_x); diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 9b92ce3e538aa660dedda67de0cabaa4adbdc8c7..1ac1c26796cf360e40e56bbe24d0217ae9d4cc36 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -13,8 +13,24 @@ // limitations under the License. #include "paddle/fluid/operators/isfinite_op.h" + #include -#include + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 2fc0d58669bae428d811c7200e025f36f087b905..a54134910d0b86eac19ec2d1217cbcc2dab23212 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -15,12 +15,19 @@ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 72da43e3bc63c1c585fe19d703892c23ce7b0ec2..fcbb4c5bf6a0b32e403672a1f40b4c8141bc84e3 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -13,10 +13,31 @@ // limitations under the License. 
#include "paddle/fluid/operators/isfinite_v2_op.h" + #include -#include + #include "paddle/fluid/operators/common_infer_shape_functions.h" -#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace operators { +template +class OverflowKernel; +} // namespace operators +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +struct float16; +} // namespace platform +} // namespace paddle namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/isfinite_v2_op.h b/paddle/fluid/operators/isfinite_v2_op.h index 9f0aa63ce80248ee9f7839890f611b9d5293789e..332c50d75513f80cc2b042e513f36c13a3244ccf 100644 --- a/paddle/fluid/operators/isfinite_v2_op.h +++ b/paddle/fluid/operators/isfinite_v2_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" @@ -22,6 +23,12 @@ #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 898f27f9afef9ca13a9f24ab1b61a50f745d40f7..d65cdc6c150ec6b9e5e4ed3e469069b3beffc819 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -136,7 +136,6 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { } using Tensor = paddle::framework::Tensor; - template void BenchKernelXYZN() { using T = typename KernelTuple::data_type; @@ -320,8 +319,15 @@ void BenchKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + PADDLE_ENFORCE_LE( + static_cast(upper - lower), n - 1, + paddle::platform::errors::InvalidArgument( + "The range of Sgd (upper - lower) should be equal to or lower " + "than n-1 (Sgd size -1). But upper - lower is %d and n-1 is %d.", + static_cast(upper - lower), (n - 1))); + PADDLE_ENFORCE_GT( + n, 0, paddle::platform::errors::InvalidArgument( + "The Sgd size should be larger than 0. 
But the n is %d.", n)); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h index 585196e939c3d2bb30d33189411c06e459d655fe..cd360caa39b4da483cdf50abab49d8658a309f0d 100644 --- a/paddle/fluid/operators/jit/gen/act.h +++ b/paddle/fluid/operators/jit/gen/act.h @@ -15,6 +15,7 @@ #pragma once #include + #include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index ff4a13a367994c1d08c4681bc64da4fcbfe143d7..d3d9eddd2ee78f61652ab392decb75ca0bf8f5d9 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -15,6 +15,7 @@ #pragma once #include + #include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc index 331a4b0d0753b37843c3d112256abfbabe9a4913..c549fec0970cb235bed77105c4297669c163c5e7 100644 --- a/paddle/fluid/operators/jit/gen/embseqpool.cc +++ b/paddle/fluid/operators/jit/gen/embseqpool.cc @@ -13,10 +13,11 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/embseqpool.h" + #include // offsetof #include #include -#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones + #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -131,11 +132,31 @@ class EmbSeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const emb_seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.table_height, 0); - PADDLE_ENFORCE_GT(attr.table_width, 0); - PADDLE_ENFORCE_GT(attr.index_height, 0); - PADDLE_ENFORCE_GT(attr.index_width, 0); - PADDLE_ENFORCE_GT(attr.out_width, 0); + PADDLE_ENFORCE_GT(attr.table_height, 0, + platform::errors::InvalidArgument( + "The attribute table_height of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.table_height)); + PADDLE_ENFORCE_GT(attr.table_width, 0, + platform::errors::InvalidArgument( + "The attribute table_width of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.table_width)); + PADDLE_ENFORCE_GT(attr.index_height, 0, + platform::errors::InvalidArgument( + "The attribute index_height of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.index_height)); + PADDLE_ENFORCE_GT(attr.index_width, 0, + platform::errors::InvalidArgument( + "The attribute index_width of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.index_width)); + PADDLE_ENFORCE_GT(attr.out_width, 0, + platform::errors::InvalidArgument( + "The attribute out_width of EmbSeqPool should be " + "larger than 0. 
But it is %d.", + attr.out_width)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/embseqpool.h b/paddle/fluid/operators/jit/gen/embseqpool.h index 67a393509523da5e4e34a233d501cc5bc0d11cd8..8353e5846f7811d00b492a6708c68d62a7c990ce 100644 --- a/paddle/fluid/operators/jit/gen/embseqpool.h +++ b/paddle/fluid/operators/jit/gen/embseqpool.h @@ -15,6 +15,7 @@ #pragma once #include + #include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc index b5b0cffa80612c61829766027013f172962b5069..fbdf49d5d585f718bef608abb8eed12a34b90897 100644 --- a/paddle/fluid/operators/jit/gen/gru.cc +++ b/paddle/fluid/operators/jit/gen/gru.cc @@ -13,8 +13,10 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/gru.h" + #include // offsetof #include + #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/gru.h b/paddle/fluid/operators/jit/gen/gru.h index e047a65cb4b1e3ef7fd2368c5b94c360a79eb666..588d11820b20e92091d391303d61dc6a0a8bf968 100644 --- a/paddle/fluid/operators/jit/gen/gru.h +++ b/paddle/fluid/operators/jit/gen/gru.h @@ -15,6 +15,7 @@ #pragma once #include + #include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/act.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h index 575dec68a582deb15ae240ae8b4d6cd409403649..113c66a2558aac061062ee7f477b613fd70b8251 100644 --- a/paddle/fluid/operators/jit/gen/hopv.h +++ b/paddle/fluid/operators/jit/gen/hopv.h @@ -15,6 +15,7 @@ #pragma once #include + #include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc index 2c3bc985e9a8b224835d848d30e0a3ef641ed2f9..211dfc5ecad3cec1f9269c6298912c57483f5c80 100644 --- a/paddle/fluid/operators/jit/gen/lstm.cc +++ b/paddle/fluid/operators/jit/gen/lstm.cc @@ -13,8 +13,10 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/lstm.h" + #include // offsetof #include + #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/lstm.h b/paddle/fluid/operators/jit/gen/lstm.h index 6f232906569fd0f0783f1a1a16421eefdd36f3c2..c980670d482c48641491c094a92602438c31ff55 100644 --- a/paddle/fluid/operators/jit/gen/lstm.h +++ b/paddle/fluid/operators/jit/gen/lstm.h @@ -15,6 +15,7 @@ #pragma once #include + #include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/act.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc index 3a455334f588da40cc39a153414658a8a68c9f38..3139b252cadbc37d6ffbe2af023bd5e836f15ab7 100644 --- a/paddle/fluid/operators/jit/gen/matmul.cc +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -13,9 +13,10 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/gen/matmul.h" + #include // offsetof #include -#include + #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -28,7 +29,11 @@ void MatMulJitCode::genCode() { preCode(); int block, rest; const auto groups = packed_groups(n_, k_, &block, &rest); - PADDLE_ENFORCE_GT(groups.front(), 0); + PADDLE_ENFORCE_GT( + groups.front(), 0, + platform::errors::InvalidArgument("The number of rest registers should " + "be larger than 0. But it is %d.", + groups.front())); const int block_len = sizeof(float) * block; const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1; @@ -117,9 +122,21 @@ class MatMulCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const matmul_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.m, 0); - PADDLE_ENFORCE_GT(attr.n, 0); - PADDLE_ENFORCE_GT(attr.k, 0); + PADDLE_ENFORCE_GT( + attr.m, 0, platform::errors::InvalidArgument( + "The attribute m (first matrix's row) of MatMul should " + "be larger than 0. But it is %d.", + attr.m)); + PADDLE_ENFORCE_GT( + attr.n, 0, platform::errors::InvalidArgument( + "The attribute n (first matrix's col) of MatMul should " + "be larger than 0. But it is %d.", + attr.n)); + PADDLE_ENFORCE_GT( + attr.k, 0, platform::errors::InvalidArgument( + "The attribute k (second matrix's col) of MatMul should " + "be larger than 0. But it is %d.", + attr.k)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h index 881cea581acc27a7aa7d395c041d40a4d3281947..eb7328d7e069cf05a22ec1ecee70f36280e6d231 100644 --- a/paddle/fluid/operators/jit/gen/matmul.h +++ b/paddle/fluid/operators/jit/gen/matmul.h @@ -17,6 +17,7 @@ #include // for malloc and free #include #include + #include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" #include "paddle/fluid/platform/enforce.h" @@ -32,7 +33,10 @@ class MatMulJitCode : public JitCode { size_t code_size = 256 * 1024, void* code_ptr = nullptr) : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { - PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); + PADDLE_ENFORCE_EQ(m_, 1, platform::errors::Unimplemented( + "Jitcode of matmul only support m==1 (first " + "matrix's row) now. But m is %d.", + m_)); this->genCode(); } diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index ec8e4e9827441bc0a817c6da455cb9e530c8c1bf..d8c7b3cdb7b1f36125b76feab19ab4369d491219 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -70,8 +70,14 @@ class SeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.w, 0); - PADDLE_ENFORCE_GT(attr.h, 0); + PADDLE_ENFORCE_GT(attr.w, 0, platform::errors::InvalidArgument( + "The attribute width of SeqPool should " + "be larger than 0. But it is %d.", + attr.w)); + PADDLE_ENFORCE_GT(attr.h, 0, platform::errors::InvalidArgument( + "The attribute height of SeqPool should " + "be larger than 0. 
But it is %d.", + attr.h)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h index a0f1bb8299ab8a087150540945a1e49bac8d301d..d4e7b2e29ce22705c0ef7320495f55483d9bfef1 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.h +++ b/paddle/fluid/operators/jit/gen/seqpool.h @@ -15,6 +15,7 @@ #pragma once #include + #include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" #include "paddle/fluid/platform/enforce.h" @@ -126,8 +127,13 @@ class SeqPoolJitCode : public JitCode { vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); reg_idx++; } - PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs, - "All heights should use same regs"); + PADDLE_ENFORCE_EQ( + reg_idx, rest_used_num_regs, + platform::errors::InvalidArgument( + "All heights of SeqPool should use the same number of registers." + "It equals to the numbr of rest registers. But use %d registers " + "and the numbr of rest registers is %d.", + reg_idx, rest_used_num_regs)); for (int i = 0; i < reg_idx; ++i) { vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); } diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc index 40f8298af39826ef3823e837f57c8e4e6209cd53..7fe93fdb6a51a811a6e60ba5af31d9a91aadd336 100644 --- a/paddle/fluid/operators/jit/gen/sgd.cc +++ b/paddle/fluid/operators/jit/gen/sgd.cc @@ -13,9 +13,10 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/sgd.h" + #include // offsetof #include -#include + #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -115,9 +116,24 @@ class SgdCreator : public JitCodeCreator { size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; } std::unique_ptr CreateJitCode( const sgd_attr_t& attr) const override { - PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width); - PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height); - PADDLE_ENFORCE_GE(attr.selected_rows_size, 0); + PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width, + platform::errors::InvalidArgument( + "The attribute param_width of Sgd should be " + "equal to the attribute grad_width. But param_width " + "is %d and grad_width is %d.", + attr.param_width, attr.grad_width)); + PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or less than the attribute grad_height. " + "But selected_rows_size is %d and grad_height is %d.", + attr.selected_rows_size, attr.grad_height)); + PADDLE_ENFORCE_GE( + attr.selected_rows_size, 0, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or larger than 0. 
But selected_rows_size is %d.", + attr.selected_rows_size)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/sgd.h b/paddle/fluid/operators/jit/gen/sgd.h index 80b1809bbbfd32d1d12a5f6bddd45a51026bdb57..403d97b8fec7c44b9ed39f043bbad4349b5fe1d7 100644 --- a/paddle/fluid/operators/jit/gen/sgd.h +++ b/paddle/fluid/operators/jit/gen/sgd.h @@ -15,6 +15,7 @@ #pragma once #include + #include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc index 66a8d75fd4de5bae3ba37cf7fe7b1645938aa855..4084d68c2a840812358ec13f33d99fbb1f592c9f 100644 --- a/paddle/fluid/operators/jit/gen/vbroadcast.cc +++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc @@ -76,7 +76,11 @@ class VBroadcastCreator : public JitCodeCreator { return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; } std::unique_ptr CreateJitCode(const int64_t& w) const override { - PADDLE_ENFORCE_GT(w, 0); + PADDLE_ENFORCE_GT( + w, 0, + platform::errors::InvalidArgument( + "The width of VBroadcast should be larger than 0. But w is %d.", + w)); return make_unique(w, CodeSize(w)); } }; diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.h b/paddle/fluid/operators/jit/gen/vbroadcast.h index 27c75f6f710e9514c7d91181e7f447d9dd997081..7d30fe5751bd462fcb5e0824e913d6f4f0b89a66 100644 --- a/paddle/fluid/operators/jit/gen/vbroadcast.h +++ b/paddle/fluid/operators/jit/gen/vbroadcast.h @@ -15,6 +15,7 @@ #pragma once #include + #include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 4c49eff49e3efc0664a084f9fa2bb897db0c6f1d..2ae71256cddcb172edb24488d559fe788e99ada5 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -49,9 +49,14 @@ void GenBase::dumpCode(const unsigned char* code) const { void* GenBase::operator new(size_t size) { void* ptr; constexpr size_t alignment = 32ul; - PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0, - "GenBase Alloc %ld error!", size); - PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); + PADDLE_ENFORCE_EQ( + posix_memalign(&ptr, alignment, size), 0, + platform::errors::InvalidArgument( + "Jitcode generator (GenBase) allocate %ld memory error!", size)); + PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::InvalidArgument( + "Fail to allocate jitcode generator " + "(GenBase) CPU memory: size = %d .", + size)); return ptr; } diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 033c603c07c288ba621ceaa912ea0c476fe86cd6..27b857634155c9b31768c5187f214f72b9a0112d 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -18,6 +18,7 @@ #include // for unique_ptr #include #include + #include "paddle/fluid/operators/jit/kernel_base.h" DECLARE_bool(dump_jitcode); diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 2952cdb87146ec01a366abaf332ce1099c425966..c66e8092d5e4221767100c94174210af24a43abc 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -66,7 +66,8 @@ const char* to_string(KernelType kt) { ONE_CASE(kEmbSeqPool); ONE_CASE(kSgd); default: - PADDLE_THROW("Not support type: %d, or forget to add it.", kt); + PADDLE_THROW(platform::errors::Unimplemented( + "JIT kernel do not support 
type: %d.", kt)); return "NOT JITKernel"; } return nullptr; @@ -79,7 +80,8 @@ const char* to_string(SeqPoolType tp) { ONE_CASE(kAvg); ONE_CASE(kSqrt); default: - PADDLE_THROW("Not support type: %d, or forget to add it.", tp); + PADDLE_THROW(platform::errors::Unimplemented( + "SeqPool JIT kernel do not support type: %d.", tp)); return "NOT PoolType"; } return nullptr; @@ -100,7 +102,8 @@ KernelType to_kerneltype(const std::string& act) { } else if (lower == "tanh" || lower == "vtanh") { return kVTanh; } - PADDLE_THROW("Not support type: %s, or forget to add this case", act); + PADDLE_THROW(platform::errors::Unimplemented( + "Act JIT kernel do not support type: %s.", act)); return kNone; } @@ -109,12 +112,19 @@ void pack_weights(const float* src, float* dst, int n, int k) { int block, rest; const auto groups = packed_groups(n, k, &block, &rest); std::for_each(groups.begin(), groups.end(), [&](int i) { - PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); + PADDLE_ENFORCE_GT(i, 0, platform::errors::InvalidArgument( + "Each element of groups should be larger than " + "0. However the element: %d doesn't satify.", + i)); }); int sum = std::accumulate(groups.begin(), groups.end(), 0); std::memset(dst, 0, k * sum * block * sizeof(float)); PADDLE_ENFORCE_GE(sum * block, n, - "The packed n should be equal to or larger than n"); + platform::errors::InvalidArgument( + "The packed n (sum * block) should be equal to or " + "larger than n (matmul row size). " + "However, the packed n is %d and n is %d.", + sum * block, n)); const int block_len = sizeof(float) * block; int n_offset = 0; @@ -136,7 +146,8 @@ void pack_weights(const float* src, float* dst, int n, int k) { template typename std::enable_if::value>::type pack_weights( const T* src, T* dst, int n, int k) { - PADDLE_THROW("Only support pack with float type."); + PADDLE_THROW(platform::errors::Unimplemented( + "Only supports pack weights with float type.")); } } // namespace jit diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 39e5ee2be1538e7884118a2280ed2e02c5de3bff..0791bb5810526cb930fe1869a60913d4239f72a3 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -21,6 +21,7 @@ #include #include // for std::move #include + #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_key.h" @@ -31,6 +32,8 @@ namespace paddle { namespace operators { namespace jit { +class GenBase; + template inline typename std::enable_if< std::is_same::value && @@ -82,8 +85,10 @@ inline const Kernel* GetReferKernel() { auto& ref_pool = ReferKernelPool::Instance().AllKernels(); KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace()); auto ref_iter = ref_pool.find(kkey); - PADDLE_ENFORCE(ref_iter != ref_pool.end(), - "Every Kernel should have reference function."); + PADDLE_ENFORCE_NE( + ref_iter, ref_pool.end(), + platform::errors::PreconditionNotMet( + "Every Refer Kernel of jitcode should have reference function.")); auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { auto i = dynamic_cast*>(impl.get()); @@ -98,7 +103,9 @@ template inline typename KernelTuple::func_type GetReferFunc() { auto ker = GetReferKernel(); auto p = dynamic_cast*>(ker); - PADDLE_ENFORCE(p, "The Refer kernel should exsit"); + PADDLE_ENFORCE_NOT_NULL(p, platform::errors::InvalidArgument( + "Get the reference code of kernel in CPU " + "failed. 
The Refer kernel should exist.")); return p->GetFunc(); } @@ -129,7 +136,9 @@ std::vector GetAllCandidateKernels( // The last implementation should be reference function on CPUPlace. auto ref = GetReferKernel(); - PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty."); + PADDLE_ENFORCE_NOT_NULL(ref, platform::errors::InvalidArgument( + "Get all candidate kernels in CPU failed. " + "The Refer Kernel can not be empty.")); res.emplace_back(ref); return res; } @@ -144,11 +153,14 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { std::string name = k->ImplType(); if (name == "JitCode") { auto i = dynamic_cast(k); - PADDLE_ENFORCE(i, "jitcode kernel cast can not fail."); + PADDLE_ENFORCE_NOT_NULL(i, + platform::errors::InvalidArgument( + "Generate jitcode kernel (GenBase) failed.")); res.emplace_back(std::make_pair(name, i->template getCode())); } else { auto i = dynamic_cast*>(k); - PADDLE_ENFORCE(i, "kernel cast can not fail."); + PADDLE_ENFORCE_NOT_NULL(i, platform::errors::InvalidArgument( + "Kernel cast (KernelMore) failed.")); res.emplace_back(std::make_pair(name, i->GetFunc())); } } @@ -170,7 +182,9 @@ template typename KernelTuple::func_type GetDefaultBestFunc( const typename KernelTuple::attr_type& attr) { auto funcs = GetAllCandidateFuncs(attr); - PADDLE_ENFORCE_GE(funcs.size(), 1UL); + PADDLE_ENFORCE_GE(funcs.size(), 1UL, + platform::errors::InvalidArgument( + "There should be at least one candidate jit kernel in CPU.")); // Here could do some runtime benchmark of this attr and return the best one. // But yet just get the first one as the default best one, // which is searched in order and tuned by offline. diff --git a/paddle/fluid/operators/jit/kernel_pool.cc b/paddle/fluid/operators/jit/kernel_pool.cc index f1719be987348b6b8f0ba5c1212808f20c922c78..7b6b13c3d989291176640066b16e9dbb340f8805 100644 --- a/paddle/fluid/operators/jit/kernel_pool.cc +++ b/paddle/fluid/operators/jit/kernel_pool.cc @@ -13,9 +13,6 @@ * limitations under the License.
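For context on the helper changes above, this is roughly how callers consume the kernel that GetDefaultBestFunc resolves; VMulTuple<float> and the pointer arguments are illustrative stand-ins (the tuple types are defined in kernel_base.h), not code from this patch:

    namespace jit = paddle::operators::jit;
    // Prefers generated jitcode, then vendor/intrinsic implementations,
    // and finally falls back to the refer kernel validated above.
    auto vmul = jit::KernelFuncs<jit::VMulTuple<float>,
                                 paddle::platform::CPUPlace>::Cache().At(n);
    vmul(x_data, y_data, z_data, n);  // element-wise z = x * y over n floats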
*/ #include "paddle/fluid/operators/jit/kernel_pool.h" -#include // for shared_ptr -#include -#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h index 48435cf6ef25448b548836e5da581609c63dd08f..763be7d3cf51f622e5ac4f654b83c9e82d1f458d 100644 --- a/paddle/fluid/operators/jit/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -20,6 +20,7 @@ #include #include // for move #include + #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_key.h" @@ -29,6 +30,8 @@ namespace paddle { namespace operators { namespace jit { +struct KernelKey; + extern std::map>& GetJITCodesMap(); template diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h index 49b1a1fea4b16f435120bb37c7d9c8c07a4cc4f5..6e0c972e27ac7b981111db03c9e8ba60e75a6c23 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/operators/jit/kernel_base.h" namespace paddle { diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h index 7b9f676050d806314edd1e46611416a8b7170add..6a44bb256129487ac194798f959c32c578e49e82 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/operators/jit/kernel_base.h" namespace paddle { diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index f5b7bfff89825bfcd6cbe4b1008628d3e1093f4c..5d63f4848e6165bfb84c1bfe301d20cc24cfc7b0 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -95,7 +95,8 @@ void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT } else if (type == kVIdentity) { return KernelFuncs, CPUPlace>::Cache().At(d); } - PADDLE_THROW("Not support type: %s", type); + PADDLE_THROW(platform::errors::Unimplemented( + "Act JIT kernel do not support type: %s", type)); return nullptr; } diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index 035425317edca95bc574807fa029ff373a7e10b8..240cbbcda4fbd0dc2a99b300b0a4c91dbb115b28 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/operators/jit/kernel_base.h" namespace paddle { diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index b38cc107b8e3038e04db4ed809d647e9a20d45fc..5f3c29ad5efb848f1fa12236ffe36a9f654864a3 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/enforce.h" @@ -102,11 +103,24 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { template void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + PADDLE_ENFORCE_EQ( + attr->table_width * attr->index_width, attr->out_width, + platform::errors::InvalidArgument( + 
"The attribute table_width * index_width of EmbSeqPool should " + "be equal to out_width. But table_width * index_width is %d, " + "out_width is %d.", + attr->table_width * attr->index_width, attr->out_width)); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d", - idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + PADDLE_ENFORCE_LT( + idx[i], attr->table_height, + platform::errors::InvalidArgument( + "The idx shoud be lower than the attribute table_height of " + "EmbSeqPool. But %dth of idx is %d and table_height is %d.", + i, idx[i], attr->table_height)); + PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument( + "The idx shoud be equal to or larger than " + "the 0. But %dth of idx is %d.", + i, idx[i])); }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -167,22 +181,50 @@ void Softmax(const T* x, T* y, int n, int bs, int remain = 1) { template void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, T* out, const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width, + platform::errors::InvalidArgument( + "The attribute param_width of Sgd should be " + "equal to the attribute grad_width. But param_width " + "is %d and grad_width is %d.", + attr->param_width, attr->grad_width)); + PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or less than the attribute grad_height. " + "But selected_rows_size is %d and grad_height is %d.", + attr->selected_rows_size, attr->grad_height)); T scalar = -lr[0]; int width = attr->grad_width; if (out == param) { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + PADDLE_ENFORCE_LT(h_idx, attr->param_height, + platform::errors::InvalidArgument( + "The rows of Sgd should be " + "less than the attribute. But %dth of rows " + "is %d and grad_width is %d.", + i, h_idx, attr->param_height)); + PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( + "The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); VAXPY(scalar, grad + i * width, out + h_idx * width, width); } } else { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + PADDLE_ENFORCE_LT(h_idx, attr->param_height, + platform::errors::InvalidArgument( + "The rows of Sgd should be " + "less than the attribute. But %dth of rows " + "is %d and grad_width is %d.", + i, h_idx, attr->param_height)); + PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( + "The rows of Sgd should be " + "larger than 0. 
But %dth of rows " + "is %d.", + i, h_idx)); VScal(&scalar, grad + i * width, out + h_idx * width, width); VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width, width); diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 136b99e0aeffec8e93e11c2e5e4f7bd35dd1c8d4..42fb7b4f279c225fb38a49d23e9d76ac1854d12d 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/operators/jit/helper.h" #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/enforce.h" @@ -146,7 +147,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT } else if (type == kVIdentity) { return VIdentity; } - PADDLE_THROW("Not support type: %s", type); + PADDLE_THROW(platform::errors::Unimplemented( + "Act JIT kernel do not support type: %s.", type)); return nullptr; } @@ -464,12 +466,25 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) { template void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + PADDLE_ENFORCE_EQ( + attr->table_width * attr->index_width, attr->out_width, + platform::errors::InvalidArgument( + "The attribute table_width * index_width of EmbSeqPool should " + "be equal to out_width. But table_width * index_width is %d and " + "out_width is %d.", + attr->table_width * attr->index_width, attr->out_width)); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d", - idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + PADDLE_ENFORCE_LT( + idx[i], attr->table_height, + platform::errors::InvalidArgument( + "The idx shoud be lower than the attribute table_height of " + "EmbSeqPool. But %dth of idx is %d and table_height is %d.", + i, idx[i], attr->table_height)); + PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument( + "The idx shoud be equal to or larger than " + "the 0. But %dth of idx is %d.", + i, idx[i])); }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -504,12 +519,31 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, template void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, T* out, const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width, + platform::errors::InvalidArgument( + "The attribute param_width of Sgd should be " + "equal to the attribute grad_width. But param_width " + "is %d and grad_width is %d.", + attr->param_width, attr->grad_width)); + PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or less than the attribute grad_height. " + "But selected_rows_size is %d and grad_height is %d.", + attr->selected_rows_size, attr->grad_height)); for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + PADDLE_ENFORCE_LT(h_idx, attr->param_height, + platform::errors::InvalidArgument( + "The rows of Sgd should be " + "less than the attribute. 
But %dth of rows " + "is %d and grad_width is %d.", + i, h_idx, attr->param_height)); + PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( + "The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); for (int64_t j = 0; j < attr->grad_width; ++j) { out[h_idx * attr->grad_width + j] = param[h_idx * attr->grad_width + j] - diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index eb56f111f0880f1a884e8f7f7ca2edcebfac695a..0cc62720b87943c8d92e56a53705ec9e4b46e047 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -850,8 +850,15 @@ void TestKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1, + paddle::platform::errors::InvalidArgument( + "The range of Sgd (upper - lower) should be lower " + "than n-1 (Sgd size -1). But the upper - lower is %d " + "and n-1 is %d.", + static_cast(upper - lower), n - 1)); + PADDLE_ENFORCE_GT( + n, 0, paddle::platform::errors::InvalidArgument( + "The Sgd size should be larger than 0. But the n is %d.", n)); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index ac0405b9a6e64a3ea53203533ae4e6787a24c2fe..231ff941278c72ce6bb2bf07315be25a32f5bc37 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -13,9 +13,23 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/label_smooth_op.h" -#include + #include +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index 6968c1a5b131211a8fc7a474df8d1692d6a5ed0f..931cd6d1794910cfa26fe03f713001c426736fad 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" @@ -27,6 +28,14 @@ limitations under the License. */ #endif #include "paddle/fluid/operators/math/math_function.h" +namespace paddle { +namespace platform { +class CPUDeviceContext; +class CUDADeviceContext; +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index 667c6e892956e29478f1401c3cb2622713433037..7cc07383bfa5f67a2404b220cb481d9017b40fd8 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/linspace_op.h" +#include namespace paddle { namespace operators { @@ -21,7 +22,7 @@ class LinspaceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); @@ -50,11 +51,17 @@ class LinspaceOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::proto::VarType::Type(ctx.Attr("dtype")), ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + return expected_kernel_type; + } }; class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/lod_array_length_op.cc b/paddle/fluid/operators/lod_array_length_op.cc index 49e8cbbbaabe75987196f27d1713be389233373f..70da0149cadc46c602a02b9511705d6d82cfbd13 100644 --- a/paddle/fluid/operators/lod_array_length_op.cc +++ b/paddle/fluid/operators/lod_array_length_op.cc @@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc index 7cbfbd03e1dcb4983863445f6a9cd2c9ee17a8b0..a9128b15bdbad28e9d24891ed96c073a82b8f4bf 100644 --- a/paddle/fluid/operators/lod_rank_table_op.cc +++ b/paddle/fluid/operators/lod_rank_table_op.cc @@ -11,8 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index b130e84933bc9a26653b5eb164ccc450fdb7b63e..db07a104a89cdb669bd39ff42e5df6acfc44f4bf 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -11,14 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace framework { +class OpDesc; +class Scope; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index 8bad10d7235b29c7769e2468c2108d287cc755cd..e95aef8eb563fba75dce4dec8b334dfb49e42968 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -28,34 +28,54 @@ using LoDTensor = framework::LoDTensor; using LoD = framework::LoD; void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - "X(Input) of MatchMatrix should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true, - "Y(Input) of MatchMatrix should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true, - "W(Input) of MatchMatrix should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - "Out(Output) of MatchMatrix should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Tmp"), true, - "Tmp(Output) of MatchMatrix should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "match_matrix_tensor"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "match_matrix_tensor"); + OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "match_matrix_tensor"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "match_matrix_tensor"); + OP_INOUT_CHECK(ctx->HasOutput("Tmp"), "Output", "Tmp", "match_matrix_tensor"); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, - "The rank of Input(X) can't be less than 2."); + platform::errors::InvalidArgument( + "The dimensions of Input(X) should be equal to 2, " + "but received %d.", + x_dims.size())); auto y_dims = ctx->GetInputDim("Y"); PADDLE_ENFORCE_EQ(y_dims.size(), 2, - "The rank of Input(Y) can't be less than 2."); + platform::errors::InvalidArgument( + "The dimensions of Input(Y) should be equal to 2, " + "but received %d.", + y_dims.size())); auto w_dims = ctx->GetInputDim("W"); - PADDLE_ENFORCE_EQ(w_dims.size(), 3UL, "W should be 3-D tensor"); + PADDLE_ENFORCE_EQ(w_dims.size(), 3, + 
platform::errors::InvalidArgument( + "The dimensions of Input(W) should be equal to 3, " + "but received %d.", + w_dims.size())); int dim_t = ctx->Attrs().Get("dim_t"); - PADDLE_ENFORCE_EQ(w_dims[0], x_dims[1], - "W 's shape must satisfy: W[0] = X[1]"); - PADDLE_ENFORCE_EQ(w_dims[1], dim_t, "W 's shape must satisfy: W[1] = dim_t"); - PADDLE_ENFORCE_EQ(w_dims[2], y_dims[1], - "W 's shape must satisfy: W[2] = Y[1]"); + PADDLE_ENFORCE_EQ( + w_dims[0], x_dims[1], + platform::errors::InvalidArgument( + "The first dimension of Input(W) should be equal to the second " + "dimension of Input(X). But received the first dimension of Input(W) " + "is %d, the second dimension of Input(X) is %d.", + w_dims[0], x_dims[1])); + PADDLE_ENFORCE_EQ( + w_dims[1], dim_t, + platform::errors::InvalidArgument( + "The second dimension of Input(W) should be equal to 'dim_t', but " + "received the second dimension of Input(W) is %d, 'dim_t' is %d.", + w_dims[1], dim_t)); + PADDLE_ENFORCE_EQ( + w_dims[2], y_dims[1], + platform::errors::InvalidArgument( + "The last dimension of Input(W) should be equal to " + "the second dimension of Input(Y). But received the last dimension " + "of Input(W) is %d, the second dimension of Input(Y) is %d.", + w_dims[2], y_dims[1])); int64_t out_dim_0 = -1; int64_t tmp_dim_0 = -1; @@ -63,27 +83,52 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { framework::Variable* x_var = BOOST_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]); const auto& x_lod = x_var->Get().lod(); - PADDLE_ENFORCE_EQ(x_lod.empty(), false, "The Input(X) must hold lod info."); + PADDLE_ENFORCE_EQ(x_lod.empty(), false, + platform::errors::InvalidArgument( + "The Input(X) should hold LoD information, but " + "received Input(X).lod() is empty.")); const auto& x_lod_0 = x_lod[0]; PADDLE_ENFORCE_GE(x_lod_0.size(), 2, - "The Input(X)'s lod info is corrupted."); - PADDLE_ENFORCE_EQ( - x_dims[0], static_cast(x_lod_0.back()), - "The Input(X)'s lod info mismatches the actual tensor shape."); + platform::errors::InvalidArgument( + "The dimensions of Input(X)'s LoD data should be " + "equal to 2, but received %d.", + x_lod_0.size())); + PADDLE_ENFORCE_EQ(x_dims[0], static_cast(x_lod_0.back()), + platform::errors::InvalidArgument( + "The last element of Input(X)'s LoD data should be " + "equal to the first dimension of Input(X). " + "But received the last element of Input(X)'s LoD " + "data is %d, the first dimension of Input(X) is %d.", + x_lod_0.back(), x_dims[0])); framework::Variable* y_var = BOOST_GET(framework::Variable*, ctx->GetInputVarPtrs("Y")[0]); const auto& y_lod = y_var->Get().lod(); - PADDLE_ENFORCE_EQ(y_lod.empty(), false, "The Input(Y) must hold lod info."); + PADDLE_ENFORCE_EQ(y_lod.empty(), false, + platform::errors::InvalidArgument( + "The Input(Y) should hold LoD information, but " + "received Input(Y).lod() is empty.")); const auto& y_lod_0 = y_lod[0]; PADDLE_ENFORCE_GE(y_lod_0.size(), 2, - "The Input(Y)'s lod info is corrupted."); - PADDLE_ENFORCE_EQ( - y_dims[0], static_cast(y_lod_0.back()), - "The Input(Y)'s lod info mismatches the actual tensor shape."); + platform::errors::InvalidArgument( + "The dimensions of Input(Y)'s LoD data should be " + "equal to 2, but received %d.", + y_lod_0.size())); + PADDLE_ENFORCE_EQ(y_dims[0], static_cast(y_lod_0.back()), + platform::errors::InvalidArgument( + "The last element of Input(Y)'s LoD data should be " + "equal to the first dimension of Input(Y). 
" + "But received the last element of Input(Y)'s LoD " + "data is %d, the first dimension of Input(Y) is %d.", + y_lod_0.back(), y_dims[0])); PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(), - "The Length of X and Y must be equal."); + platform::errors::InvalidArgument( + "The dimensions of Input(X)'s and Input(Y)'s LoD " + "data should be equal. " + "But received the dimensions of Input(X)'s LoD is " + "%d, the dimensions of Input(Y)'s LoD is %d.", + x_lod_0.size(), y_lod_0.size())); out_dim_0 = 0; for (size_t i = 1; i < x_lod_0.size(); i++) { @@ -98,10 +143,18 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { // compile time framework::VarDesc* x_desc = BOOST_GET(framework::VarDesc*, ctx->GetInputVarPtrs("X")[0]); - PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1); + PADDLE_ENFORCE_GE( + x_desc->GetLoDLevel(), 1, + platform::errors::InvalidArgument("The LoD level of Input(X) should be " + "greater than 1, but reviced %d.", + x_desc->GetLoDLevel())); framework::VarDesc* y_desc = BOOST_GET(framework::VarDesc*, ctx->GetInputVarPtrs("Y")[0]); - PADDLE_ENFORCE_GE(y_desc->GetLoDLevel(), 1); + PADDLE_ENFORCE_GE( + y_desc->GetLoDLevel(), 1, + platform::errors::InvalidArgument("The LoD level of Input(Y) should be " + "greater than 1, but reviced %d.", + y_desc->GetLoDLevel())); ctx->ShareLoD("X", "Out"); } @@ -115,14 +168,11 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { void MatchMatrixTensorOpGrad::InferShape( framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - "Input(X) of SequencePadGradOp should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true, - "Input(Y) of SequencePadGradOp should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true, - "Input(W) of SequencePadGradOp should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - "Input(Out@GRAD) of SequencePadGradOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "match_matrix_tensor_grad"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "match_matrix_tensor_grad"); + OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "match_matrix_tensor_grad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "match_matrix_tensor_grad"); if (ctx->HasOutput(framework::GradVarName("X"))) { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index 550de1aadde2935fae34226dba78cc06d82cd1f3..5271da91b8c151dfef50810c4ae46bdf27d7a2af 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -13,8 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/beam_search.h" -#include -#include + +namespace paddle { +namespace framework { +class LoDTensor; +class Tensor; +} // namespace framework +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index 7ea8eb8b00db328ca13d3d33d751aca4eac66dae..97ce3d3f87832494900f041773bbdee1fb0639af 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/beam_search.h" + #include -#include void PrepareCPUTensors(paddle::framework::LoDTensor* ids, paddle::framework::LoDTensor* scores, diff --git a/paddle/fluid/operators/math/blas.cc b/paddle/fluid/operators/math/blas.cc index 2a7ce83967f0f74f4c2178dd4277e6a1687b5ec7..3bc1b4f4048eb02b1417e697c98f1e93b1106093 100644 --- a/paddle/fluid/operators/math/blas.cc +++ b/paddle/fluid/operators/math/blas.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/math/blas.h" - #include + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 42a60e9220cf848ba766a19cb7b4d13edc460c11..562e2de3bd3111c3cdea1b18c1fa64f6bd9911ea 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -17,6 +17,13 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" +namespace paddle { +namespace framework { +class ExecutionContext; +class Tensor; +} // namespace framework +} // namespace paddle + #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" #endif diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index a0464cf70e2dcc44c42fc2ca7440680ef8a53e6e..aeafe22235c0954d16a73ac242ccb9e54a15413b 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -420,6 +420,22 @@ void Blas::GEMV(bool trans_a, int M, int N, }); } +template <> +template <> +inline void Blas::GEMV( + bool trans_a, int M, int N, platform::float16 alpha, + const platform::float16 *A, const platform::float16 *B, + platform::float16 beta, platform::float16 *C) const { + // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. 
+ if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -479,6 +495,19 @@ void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::float16 alpha, const platform::float16 **A, + const platform::float16 **B, platform::float16 beta, platform::float16 **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], + B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 515d6a2435e86fe07ffe1309628ef2fbeefdc6f0..c53c453897fbae76745fd5c3aaf3e3a836527ded 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -695,9 +695,9 @@ void Blas::BatchedGEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, const T **A, const T **B, T beta, T **C, int batchCount) const { #ifdef PADDLE_WITH_MKLML - const int lda = std::max((transA == CblasNoTrans) ? K : M, 1); - const int ldb = std::max((transB == CblasNoTrans) ? N : K, 1); - const int ldc = std::max(N, 1); + const int lda = (std::max)((transA == CblasNoTrans) ? K : M, 1); + const int ldb = (std::max)((transB == CblasNoTrans) ? N : K, 1); + const int ldc = (std::max)(N, 1); CBlas::GEMM_BATCH(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc, 1 /* group_count */, &batchCount); diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index b6bd58d118e9fc8b2d8f831660a3c93f8c59e605..3b0c3c1686af6fd9da6864b535c6c75e252485ea 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -13,7 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" -#include + +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +namespace platform { +class CPUDeviceContext; +struct bfloat16; +struct float16; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 270a9d3f80a80d5ea2c8b97d4a69125355ddef61..094e2059c4d4c4b16c79817bac9dc84703a2b013 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include + #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/concat_and_split.h" diff --git a/paddle/fluid/operators/math/context_project.cc b/paddle/fluid/operators/math/context_project.cc index 537d0b47868fb68d59e3b3556a54ba85d5f06960..927d610e2ce47e4206615ee435377d01d8ea9db3 100644 --- a/paddle/fluid/operators/math/context_project.cc +++ b/paddle/fluid/operators/math/context_project.cc @@ -14,6 +14,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/context_project.h" +namespace paddle { +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h index 051c6019d74f7d2820dc0ba668da3cafe8864346..08bb555c593787cd733a0c81efda5e72e0929f93 100644 --- a/paddle/fluid/operators/math/context_project.h +++ b/paddle/fluid/operators/math/context_project.h @@ -16,8 +16,10 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/operators/math/blas.h" + #include "paddle/fluid/operators/math/im2col.h" namespace paddle { diff --git a/paddle/fluid/operators/math/cos_sim_functor.cc b/paddle/fluid/operators/math/cos_sim_functor.cc index cbe16999124a67ac764afade92f7320a12129cd1..f7770050bee442ae3bcfe826be606a85d35608eb 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cc +++ b/paddle/fluid/operators/math/cos_sim_functor.cc @@ -14,6 +14,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/cos_sim_functor.h" +namespace paddle { +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/cos_sim_functor.h b/paddle/fluid/operators/math/cos_sim_functor.h index d74662e68e770f0b5d83a150e60156a61cefcf58..9a24bfc3312665be296f2a6c89da27758d43550e 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.h +++ b/paddle/fluid/operators/math/cos_sim_functor.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/hostdevice.h" @@ -43,6 +44,7 @@ struct CosSimFunctor { tep_x = x[i]; tep_y = y[i]; xx += tep_x * tep_x; + yy += tep_y * tep_y; xy += tep_x * tep_y; } diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 925f3b6161ae8506107f917196e77ecb2d9c5593..eb7c622e596ce4781f30bb6cc071a5a7adb09350 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index 6490d81cec7c70ee5332c2667a5fb79f01e10311..07fe9c30f39bf44c3ec7aa33a3f3dd63f50a1e9a 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -16,12 +16,10 @@ limitations under the License. */ #include #include #include -#include "gflags/gflags.h" + #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/platform/port.h" inline double GetCurrentUS() { struct timeval time; diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index 7a1ed47d182a080e10aa7ce5d95f605eb2a151a1..23840143a443da106349214966aee78f50b25088 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -14,6 +14,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/cross_entropy.h" +namespace paddle { +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc index 07c5cbf33378e6f6cee8a82448f55399966a2574..4b8a6274cceaccaad9000c8d0a415a486b44b2f0 100644 --- a/paddle/fluid/operators/math/gru_compute.cc +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -10,10 +10,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/gru_compute.h" + #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" #include "paddle/fluid/operators/math/detail/gru_kernel.h" +namespace paddle { +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 6fb393d791cc2a077dbcd0a912bcf31b5d59ad65..8efd35ca108100e4d224890846433433702c57a9 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -13,9 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" -#include + #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" +namespace paddle { +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc index 94bbcbb50670d9f0b11b77cf6a54a99c227521bf..7e74f6880196d343b2901fd86e5bc82f12247d5c 100644 --- a/paddle/fluid/operators/math/lstm_compute.cc +++ b/paddle/fluid/operators/math/lstm_compute.cc @@ -13,9 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/lstm_compute.h" + #include "paddle/fluid/operators/math/detail/lstm_cpu_kernel.h" #include "paddle/fluid/operators/math/detail/lstm_kernel.h" +namespace paddle { +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index d6f51c6e5c693becb14ff0bac0088bb9dc2b2f55..7c50ba630dbd91ef8c6d51cbde862336b5ab83cb 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/matrix_bit_code.h" -#include -#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc index 614f89a048c4e92e758ddb39da43322be284f9e5..950aed0aa497416665d4ebc8b7dfc95f812bfb7d 100644 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ b/paddle/fluid/operators/math/matrix_inverse.cu.cc @@ -12,14 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/blas.h" +namespace paddle { +namespace platform { +class CUDADeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace math { +template +class MatrixInverseFunctor; + template class MatrixInverseFunctor { public: diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 1e86c2e7a32bcc6fb4e080885dfdfb94e4f8a2e2..40cea7483f39781a46689ecd1aa21d0bf8cf2c07 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -13,9 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/pooling.h" #include -#include -#include -#include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sample_prob.cc b/paddle/fluid/operators/math/sample_prob.cc index 99aa318453eae161807353198a78e11085cd6237..16342493e45977e1b42b60be6857a1b6116cf121 100644 --- a/paddle/fluid/operators/math/sample_prob.cc +++ b/paddle/fluid/operators/math/sample_prob.cc @@ -14,6 +14,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sample_prob.h" +namespace paddle { +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index e5a6d84cb2b0527c606e62a19ef02d669945ecb1..7b08df660a0bb74b3f641e5aef7275b28ca9d9d7 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -16,10 +16,17 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/eigen.h" + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/sampler.h" +namespace paddle { +namespace platform { +class CUDADeviceContext; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..bb2b6db100b65dae175af1738a9592b1c4212a9a --- /dev/null +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -0,0 +1,365 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/segment_pooling.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_param_config.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +__global__ void SegmentMeanCustomKernel( + const Index* segment_ids, const T* input, T* output, T* summed_ids, + const Index input_length_size, const Index inner_dim_size, + const Index output_length_size, const Index total_stripe_count) { + CUDA_KERNEL_LOOP(stripe_index, total_stripe_count) { + const Index segment_offset = stripe_index % inner_dim_size; + const Index dim_index_base = + stripe_index / inner_dim_size * Index(DimTileSize); + const Index actual_height = + min(Index(DimTileSize), input_length_size - dim_index_base); + + Index first_segment_id = segment_ids[dim_index_base]; + Index last_segment_id = -1; + if (dim_index_base > 0) { + last_segment_id = segment_ids[dim_index_base - 1]; + } + if (segment_offset == 0) { + T sum = T(0); + for (Index j = 0; j < actual_height; j++) { + Index current_segment_id = segment_ids[dim_index_base + j]; + // Note(ZHUI): following check may cause + // cudaErrorLaunchOutOfResources. + // PADDLE_ENFORCE(current_segment_id >= last_segment_id, + // "the segment ids should be sorted, but got " + // "segment_ids[%d]:%d > segment_ids[%d]:%d.", + // dim_index_base + j - 1, dim_index_base + j, + // last_segment_id, current_segment_id); + + if (j > 0 && current_segment_id > last_segment_id) { + if (last_segment_id == first_segment_id) { + platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + } else { + *(summed_ids + last_segment_id) = sum; + } + sum = T(0); + } + sum += T(1); + last_segment_id = current_segment_id; + } + platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + } + // ensure last_segment_id is the largest + last_segment_id = output_length_size; + __syncthreads(); + T sum = T(0); + for (Index j = 0; j < actual_height; j++) { + Index current_segment_id = segment_ids[dim_index_base + j]; + if (current_segment_id > last_segment_id) { + const Index output_index = + last_segment_id * inner_dim_size + segment_offset; + if (last_segment_id == first_segment_id) { + platform::CudaAtomicAdd(output + output_index, + sum / *(summed_ids + last_segment_id)); + } else { + *(output + output_index) = sum / *(summed_ids + last_segment_id); + } + sum = T(0); + } + sum += input[(dim_index_base + j) * inner_dim_size + segment_offset]; + last_segment_id = current_segment_id; + } + const Index output_index = + last_segment_id * inner_dim_size + segment_offset; + platform::CudaAtomicAdd(output + output_index, + sum / *(summed_ids + last_segment_id)); + } +} + +template +__global__ void SegmentOpsKernel(const Index* segment_ids, const T* input, + T* output, Helper h, Pool pool) { + CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) { + Index segment_offset, dim_index_base, actual_height; + Index inner_dim_size = h.inner_dim_size; + h.calculate(stripe_index, segment_offset, dim_index_base, actual_height); + + T minmax = pool.initial(); + Index first_segment_id = segment_ids[dim_index_base]; + // -1 is for the start value when interval_id = 0 + Index last_segment_id = -1; + if (dim_index_base > 0) { + last_segment_id = segment_ids[dim_index_base - 1]; + } + + for (Index j = 0; 
j < actual_height; j++) { + Index current_segment_id = segment_ids[dim_index_base + j]; + // ensure the segment_ids is sorted. + PADDLE_ENFORCE(current_segment_id >= last_segment_id, + "The segment ids should be sorted, but got " + "segment_ids[%d]:%d > segment_ids[%d]:%d.", + dim_index_base + j - 1, dim_index_base + j, + last_segment_id, current_segment_id); + + if (current_segment_id > last_segment_id) { + // reset the interval value which do not have corresponding ids. + for (Index interval_id = last_segment_id + 1; + interval_id < current_segment_id; ++interval_id) { + *(output + interval_id * inner_dim_size + segment_offset) = 0; + } + // don't update result when j=0 + if (j > 0) { + const Index output_index = + last_segment_id * inner_dim_size + segment_offset; + if (last_segment_id == first_segment_id) { + pool.atomic(output + output_index, minmax); + } else { + *(output + output_index) = minmax; + } + minmax = pool.initial(); + } + } + pool.compute( + input[(dim_index_base + j) * inner_dim_size + segment_offset], + &minmax); + last_segment_id = current_segment_id; + } + const Index output_index = + last_segment_id * inner_dim_size + segment_offset; + pool.atomic(output + output_index, minmax); + } +} + +template +__global__ void SegmentIndexGradKernel(const Index* segment_ids, const T* input, + const T* output, const T* out_grad, + T* in_grad, Helper h) { + CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) { + Index segment_offset, dim_index_base, actual_height; + h.calculate(stripe_index, segment_offset, dim_index_base, actual_height); + + for (Index j = 0; j < actual_height; j++) { + Index current_segment_id = segment_ids[dim_index_base + j]; + Index input_index = + (dim_index_base + j) * h.inner_dim_size + segment_offset; + Index output_index = + current_segment_id * h.inner_dim_size + segment_offset; + if (input[input_index] == output[output_index]) { + in_grad[input_index] = out_grad[output_index]; + } + } + } +} + +template +class MaxPool { + public: + DEVICE inline T initial() { return static_cast(-FLT_MAX); } + DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } + DEVICE inline T atomic(T* address, const T val) { + return platform::CudaAtomicMax(address, val); + } +}; + +template +class MinPool { + public: + DEVICE inline T initial() { return static_cast(FLT_MAX); } + DEVICE inline void compute(const T& x, T* y) { *y = *y < x ? 
*y : x; } + DEVICE inline T atomic(T* address, const T val) { + return platform::CudaAtomicMin(address, val); + } +}; + +template +class SumPool { + public: + DEVICE inline T initial() { return static_cast(0); } + DEVICE inline void compute(const T& x, T* y) { *y = *y + x; } + DEVICE inline T atomic(T* address, const T val) { + return platform::CudaAtomicAdd(address, val); + } +}; + +template +class ArrangeHelper { + public: + const T input_total_size; + const T input_length_size; + const T output_length_size; + T inner_dim_size; + T total_stripe_count; + const T DimTileSize = 8; + + ArrangeHelper(T a, T b, T c) + : input_total_size(a), input_length_size(b), output_length_size(c) { + T input_outer_dim_num_stripe = + (input_length_size + DimTileSize - 1) / DimTileSize; + inner_dim_size = input_total_size / input_length_size; + total_stripe_count = inner_dim_size * input_outer_dim_num_stripe; + } + + DEVICE inline void calculate(T stripe_index, T& segment_offset, + T& dim_index_base, T& actual_height) { + segment_offset = stripe_index % inner_dim_size; + dim_index_base = stripe_index / inner_dim_size * DimTileSize; + actual_height = min(DimTileSize, input_length_size - dim_index_base); + } +}; + +template +void SegmentPoolCUDAGradFunctor(const platform::CUDADeviceContext& ctx, + const framework::Tensor& input, + const framework::Tensor& segment_ids, + const framework::Tensor& output, + const framework::Tensor& out_grad, + framework::Tensor* in_grad, + const std::string pooltype = "SUM") { + auto h = ArrangeHelper(input.numel(), segment_ids.dims()[0], + output.dims()[0]); + auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count); + if (pooltype == "MAX" || pooltype == "MIN") { + SegmentIndexGradKernel><<< + config.block_per_grid.x, config.thread_per_block.x, 0, ctx.stream()>>>( + segment_ids.data(), input.data(), output.data(), + out_grad.data(), in_grad->data(), h); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported segment pooling grad operation, Only MAX, MIN " + "available, but got %s.", + pooltype)); + } +} + +template +__global__ void SimpleDiv(T* x, const T* y, const int len, const int dim) { + for (int i = blockIdx.x; i < len; i += gridDim.x) { + __shared__ T y_i; + auto base = i * dim; + if (threadIdx.x == 0) { + y_i = y[i]; + } + __syncthreads(); + for (int j = threadIdx.x; j < dim; j += blockDim.x) { + x[base + j] /= y_i; + } + } +} + +template +class SegmentPoolFunctor { + public: + void operator()(const platform::CUDADeviceContext& ctx, + const framework::Tensor& input, + const framework::Tensor& segment_ids, + framework::Tensor* output, + framework::Tensor* summed_ids = nullptr, + const std::string pooltype = "SUM") { + auto h = ArrangeHelper(input.numel(), segment_ids.dims()[0], + output->dims()[0]); + auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count); + if (pooltype == "MEAN") { + SegmentMeanCustomKernel< + T, IndexT, IndexT(8)><<>>( + segment_ids.data(), input.data(), output->data(), + summed_ids->data(), h.input_length_size, h.inner_dim_size, + h.output_length_size, h.total_stripe_count); + } else if (pooltype == "SUM") { + SumPool pool; + SegmentOpsKernel< + T, IndexT, ArrangeHelper, + SumPool><<>>(segment_ids.data(), + input.data(), output->data(), h, + pool); + } else if (pooltype == "MAX") { + MaxPool pool; + SegmentOpsKernel< + T, IndexT, ArrangeHelper, + MaxPool><<>>(segment_ids.data(), + input.data(), output->data(), h, + pool); + } else if (pooltype == "MIN") { + MinPool pool; + SegmentOpsKernel< + T, 
IndexT, ArrangeHelper, + MinPool><<>>(segment_ids.data(), + input.data(), output->data(), h, + pool); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN " + "available, but got %s.", + pooltype)); + } + } +}; + +template +class SegmentPoolGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& out_grad, + const framework::Tensor& segments, framework::Tensor* in_grad, + const framework::Tensor* summed_ids = nullptr, + const std::string pooltype = "SUM") { + if (pooltype == "MAX" || pooltype == "MIN") { + SegmentPoolCUDAGradFunctor(context, input, segments, output, + out_grad, in_grad, pooltype); + } else if (pooltype == "MEAN") { + framework::Tensor mean_grad; + mean_grad.mutable_data(input.dims(), context.GetPlace()); + framework::TensorCopy(out_grad, context.GetPlace(), context, &mean_grad); + int len = output.dims()[0]; + int dim = output.numel() / len; + auto config = platform::GetGpuLaunchConfig1D(context, len); + SimpleDiv<<>>(mean_grad.data(), + summed_ids->data(), len, dim); + GPUGather(context, mean_grad, segments, in_grad); + } else if (pooltype == "SUM") { + GPUGather(context, out_grad, segments, in_grad); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN " + "available, but got %s.", + pooltype)); + } + } +}; + +using CUDA = paddle::platform::CUDADeviceContext; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 81ad620466ee3d9fcd9d3e057cfd0dd9053089f0..5cb1cc5dc037146bbfa17f81753aee0c02997477 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/selected_rows_functor.h" -#include #include "gtest/gtest.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc index 300a3692012ab9631d7049d2042e91fb99ad3c21..852700fa7ff3c1f992eb48b5c684cfaad96cfc31 100644 --- a/paddle/fluid/operators/math/sequence2batch.cc +++ b/paddle/fluid/operators/math/sequence2batch.cc @@ -14,6 +14,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sequence2batch.h" +namespace paddle { +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 076df0176429c7bbd350698af0137fbcca18f806..e29313e9f742ca27f56649278b6a1583e8cabb02 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -14,6 +14,16 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/sequence_padding.h" +namespace paddle { +namespace framework { +class LoDTensor; +class Tensor; +} // namespace framework +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index eab4553ae8b9745b71a21198d0e5ccf1b8a62a35..8892a17886a736d8a4490d1240861e178afc27d4 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence_padding.h" + #include -#include template void TestSequencePadding(const DeviceContext &context, diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc index ee5b22ca855b4fa26e9626aadb84fa9b93b72952..78cbdf311ad1982a5de8913e3fb432e2d824b643 100644 --- a/paddle/fluid/operators/math/sequence_scale.cc +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -14,6 +14,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sequence_scale.h" +namespace paddle { +namespace framework { +class LoDTensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h index 202243985c125cd518a27477eb370bf1a325fe16..d84513e024d7f62195fc3153e1f6e446104d0359 100644 --- a/paddle/fluid/operators/math/sequence_scale.h +++ b/paddle/fluid/operators/math/sequence_scale.h @@ -17,6 +17,12 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class LoDTensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { namespace math { @@ -40,6 +46,7 @@ namespace math { * total_sequence_length is the sum of all sequences' * length. * \param scales Array. The i-th sequence will be scaled by scales[i]. + * \param num_seq Number of sequence * */ diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 794fc647172b040d4e926144a87b84eb4e5216b0..42bf1f471deb5238fdb34dcd9284972930305f58 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -13,7 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/vol2col.h" -#include + +namespace paddle { +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index aa979c4f10907e604758c3e2cfb776cb994c9ceb..6ed5a0943ebb329d15521f2610cecfba16a782d1 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -13,9 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/vol2col.h" + #include -#include -#include template void testVol2col() { diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu index 64ec65a23419725c7cc481beadb9383402a426bd..91958513ddb3c9923487e5de86f188bc3a0a6f65 100644 --- a/paddle/fluid/operators/matmul_v2_op.cu +++ b/paddle/fluid/operators/matmul_v2_op.cu @@ -17,10 +17,12 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plf = paddle::platform; -REGISTER_OP_CUDA_KERNEL(matmul_v2, - ops::MatMulV2Kernel, - ops::MatMulV2Kernel); +REGISTER_OP_CUDA_KERNEL( + matmul_v2, ops::MatMulV2Kernel, + ops::MatMulV2Kernel, + ops::MatMulV2Kernel); REGISTER_OP_CUDA_KERNEL( matmul_v2_grad, ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel); + ops::MatMulV2GradKernel, + ops::MatMulV2GradKernel); diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index dc83e4d964815ec46452bb0086cf17437b3846a4..ee485bd1711e21b86cdf65fdb2f5f0793e42beb4 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -65,7 +65,7 @@ static void GetBroadcastFromDims(const int x_ndim, const std::int64_t* x_dims, std::int64_t* x_bd_dims, std::int64_t* y_bd_dims, std::int64_t* out_bd_dims) { - const int ndim = std::max(x_ndim, y_ndim); + const int ndim = (std::max)(x_ndim, y_ndim); std::fill(x_bd_dims, x_bd_dims + ndim - x_ndim, 1); std::fill(y_bd_dims, y_bd_dims + ndim - y_ndim, 1); std::copy(x_dims, x_dims + x_ndim, x_bd_dims + ndim - x_ndim); @@ -79,7 +79,7 @@ static void GetBroadcastFromDims(const int x_ndim, const std::int64_t* x_dims, if (x_bd_dims[i] == 0 || y_bd_dims[i] == 0) { out_bd_dims[i] = 0; } else { - out_bd_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]); + out_bd_dims[i] = (std::max)(x_bd_dims[i], y_bd_dims[i]); } } } @@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, if (trans_y) { const int M = Y->numel() / N; VLOG(3) << "MatMul's case 2"; - blas.GEMV(false, M, N, 1., y_data, x_data, 0., Out->data()); + blas.GEMV(false, M, N, static_cast(1), y_data, x_data, + static_cast(0), Out->data()); } else { const int M = y_dims[y_ndim - 1]; const int batch_size = Y->numel() / (M * N); if (batch_size == 1) { VLOG(3) << "MatMul's case 3"; - blas.GEMV(true, N, M, 1., y_data, x_data, 0., Out->data()); + blas.GEMV(true, N, M, static_cast(1), y_data, x_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 4"; - blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, y_data, - x_data, 0, Out->data(), batch_size, M * N, 0); + blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast(1), + y_data, x_data, static_cast(0), Out->data(), + batch_size, M * N, 0); } } return; @@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, const int batch_size = X->numel() / (M * N); if (batch_size == 1) { VLOG(3) << "MatMul's case 5"; - blas.GEMV(true, N, M, 1.0f, x_data, y_data, 0.0f, Out->data()); + blas.GEMV(true, N, M, static_cast(1), x_data, y_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 6"; - blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, x_data, - y_data, 0, Out->data(), batch_size, M * N, 0); + blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast(1), + x_data, y_data, static_cast(0), Out->data(), + batch_size, M * N, 0); } } else { const int M = X->numel() / N; VLOG(3) << "MatMul's case 7"; - blas.GEMV(false, M, N, 1.0f, x_data, y_data, 0.0f, Out->data()); + blas.GEMV(false, M, N, 
static_cast(1), x_data, y_data, + static_cast(0), Out->data()); } return; } @@ -229,7 +235,7 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, "Input(X) has error dim.")); } const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = std::max(x_ndim, y_ndim); + const int ndim = (std::max)(x_ndim, y_ndim); std::vector x_broadcast_dims(ndim); std::vector y_broadcast_dims(ndim); std::vector out_broadcast_dims(ndim); @@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, if (x_batch_size == 1 && y_batch_size == 1) { VLOG(3) << "MatMul's case 8"; blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, - y_data, 0.0f, Out->data()); + trans_y ? CblasTrans : CblasNoTrans, M, N, K, static_cast(1), + x_data, y_data, static_cast(0), Out->data()); } else if (x_batch_size == 1) { if (M == 1 && trans_y) { VLOG(3) << "MatMul's case 9"; - blas.GEMV(false, y_batch_size * N, K, 1.0f, y_data, x_data, 0.0f, - Out->data()); + blas.GEMV(false, y_batch_size * N, K, static_cast(1), y_data, x_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 10"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, - x_data, y_data, 0, Out->data(), out_batch_size, 0, - K * N); + trans_y ? CblasTrans : CblasNoTrans, M, N, K, + static_cast(1), x_data, y_data, static_cast(0), + Out->data(), out_batch_size, 0, K * N); } } else if (y_batch_size == 1) { if (!trans_x) { VLOG(3) << "MatMul's case 11"; blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans, - x_batch_size * M, N, K, 1.0f, x_data, y_data, 0.0f, - Out->data()); + x_batch_size * M, N, K, static_cast(1), x_data, y_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 12"; blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K, - 1.0f, x_data, y_data, 0, Out->data(), out_batch_size, - M * K, 0); + static_cast(1), x_data, y_data, static_cast(0), + Out->data(), out_batch_size, M * K, 0); } } else if (!is_broadcast_dims) { VLOG(3) << "MatMul's case 13"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, - y_data, 0, Out->data(), out_batch_size, M * K, K * N); + trans_y ? CblasTrans : CblasNoTrans, M, N, K, + static_cast(1), x_data, y_data, static_cast(0), + Out->data(), out_batch_size, M * K, K * N); } else { // in the case, can't use stridedgemm std::vector x_ptr(out_batch_size); @@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, } VLOG(3) << "MatMul's case 14"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, - x_ptr.data(), y_ptr.data(), 0.0f, out_ptr.data(), - out_batch_size); + trans_y ? CblasTrans : CblasNoTrans, M, N, K, + static_cast(1), x_ptr.data(), y_ptr.data(), + static_cast(0), out_ptr.data(), out_batch_size); } } diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc index b47ec8bc70a207c5f662e9a7ee0cdd944936e7a0..4f73de086f2a9f32ee07d644e8c470234cc947ba 100644 --- a/paddle/fluid/operators/max_sequence_len_op.cc +++ b/paddle/fluid/operators/max_sequence_len_op.cc @@ -12,10 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index c9b852cfc05127a4bbf00ea23a751c59dc2d109d..584de34c5d329c184eb7d26c6efad82ed4978fb2 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -13,7 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class LoDTensor; +class OpDesc; +class Scope; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle namespace paddle { namespace operators { @@ -44,8 +55,10 @@ class MergeLoDTensorOp : public framework::OperatorBase { scope.FindVar(Output("Out"))->GetMutable(); auto level = static_cast(Attr("level")); - PADDLE_ENFORCE(in_true.numel() || in_false.numel(), - "Input(InTrue) or Input(InFalse) should be initialized."); + PADDLE_ENFORCE_EQ( + in_true.numel() || in_false.numel(), true, + platform::errors::InvalidArgument( + "Input(InTrue) or Input(InFalse) should be initialized.")); auto &mask_dim = mask.dims(); std::unique_ptr cpu_mask{new framework::LoDTensor()}; @@ -56,7 +69,9 @@ class MergeLoDTensorOp : public framework::OperatorBase { framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else - PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Not supported GPU, Please recompile or reinstall paddle with CUDA " + "support.")); #endif } auto *mask_data = cpu_mask->data(); @@ -109,7 +124,11 @@ class MergeLoDTensorOp : public framework::OperatorBase { size_t start_offset = lod_and_offset.second.first; size_t end_offset = lod_and_offset.second.second; - PADDLE_ENFORCE_GE(end_offset, start_offset); + PADDLE_ENFORCE_GE(end_offset, start_offset, + platform::errors::InvalidArgument( + "The end offset less than start offset, end offset " + "is %d, start offset is %d.", + end_offset, start_offset)); size_t len = end_offset - start_offset; if (len == 0) { continue; @@ -189,22 +208,24 @@ class MergeLoDTensorInferShape : public framework::InferShapeBase { "merge_lod_tensor"); auto mask_dim = context->GetInputDim("Mask"); PADDLE_ENFORCE_EQ(mask_dim.size(), 2, - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond should be a 2-D tensor and " - "the second dim size of cond should be 1. " - "But now the cond's shape is [", - *mask_dim.Get(), "].\n"); + platform::errors::InvalidArgument( + "If you are using IfElse OP:" + "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " + "ie.true_block():\n out_1 = ie.input(x)\n\n" + "Please ensure that the cond is a 2-D tensor and " + "the second dim size of cond is 1. 
" + "But now the cond's shape is [%s].\n", + mask_dim)); if (context->IsRuntime() || mask_dim[1] > 0) { PADDLE_ENFORCE_EQ(mask_dim[1], 1, - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond should be a 2-D tensor " - "and the second dim size of cond should be 1. " - "But now the cond's shape is [", - *mask_dim.Get(), "].\n"); + platform::errors::InvalidArgument( + "If you are using IfElse OP:" + "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " + "ie.true_block():\n out_1 = ie.input(x)\n\n" + "Please ensure that the cond is a 2-D tensor " + "and the second dim size of cond is 1. " + "But now the cond's shape is [%s].\n", + mask_dim)); } context->SetOutputDim("Out", context->GetInputDim("InTrue")); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 487deb11b48687a91174c8d9baf072a5ca929de8..aecf67fc3bb1d91c96ace38c8e7d83ea6af3a88b 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -15,6 +15,15 @@ #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +namespace platform { +class MKLDNNDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 8a02a697cbb21b28e14f19c6202ae0777b5102de..98f368aa7a90859121a06e42705aee6355182b27 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -12,10 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "mkldnn.hpp" #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +namespace platform { +class MKLDNNDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 19ee8764e27b235a2fa8e0720c11bce601b030db..a6cda154e55b972fc653cffc4815f9e0f6e975de 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -12,12 +12,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include #include "paddle/fluid/framework/data_layout_transform.h" -#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace paddle { +namespace platform { +class MKLDNNDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 40737f4cd029b47dbd03069a2e4d29ad33121eb9..0bec5619f5427adf9c38bbdd6480b6da28894b8a 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -12,13 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include -#include "paddle/fluid/framework/tensor.h" + #include "paddle/fluid/operators/fc_op.h" -#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { +class LoDTensor; +class Tensor; +} // namespace framework +namespace platform { +class MKLDNNDeviceContext; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 00c10cecbf4828d2157505abca49763b6ded1b16..9ee653ec58912b326dc44f3f2289bd9dac6b3c62 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -12,10 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/lrn_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +namespace platform { +class MKLDNNDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index 5ca0ed1182e74e681e9e36e55b61f58b5da66170..3ae34fe0e9011940dabd85661e3c122e953d24c3 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "mkldnn.hpp" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/mkldnn_helper.h" +namespace paddle { +namespace platform { +class MKLDNNDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 1dd1ad117862d92aa8d358f04f8b03fec7abafff..4f0b7cab47efe33fbb297542975590b44f62c75a 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -13,12 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include -#include "paddle/fluid/framework/data_layout_transform.h" -#include "paddle/fluid/memory/malloc.h" + #include "paddle/fluid/operators/mul_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +namespace platform { +class MKLDNNDeviceContext; +} // namespace platform +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 9df30b3295c00e69a956ee84770dfeb19a83487c..bf12c61a4d9b16ffa41a44a1b53d6ec9daa3717d 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/operators/pool_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 5014381a4e215917883f45288de4482db5cbf79c..0b159f9dcfaafde04dec862749d724ed02554978 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include "mkldnn.hpp" #include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +namespace platform { +class MKLDNNDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 1e0e13abb7c641d441b9c6188f7b9103c4ec7292..414312fe97ecb7d9ca1c53022e4340e8aed7b8a8 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -24,13 +24,19 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "mkldnn.hpp" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/sum_op.h" -#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +namespace platform { +class CPUDeviceContext; +class MKLDNNDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc index 70d80e26e5c6c51f4f0fbac304f48c5f4b3f62b2..169af47e95acdcc9450a892cd1dddc0d3ec9de1e 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 07333f1ae11c3889b543ca6d327e480607a4bcea..02dcb4045f4cdee6840f5caef98d7329e706eaf2 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout; // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * // np.sum(dy, // axis=(n,h,w)) * (x - mean) * -// (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var - +// (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - // inv_var // * // np.mean(dy, axis=(n,h,w)) - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), -// axis=(n,h,w)))) +// axis=(n,h,w))) template __global__ void DoubleGradComputeDX(const T *x, const T *mean, @@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean, ? (j / sample_size * C + i) * sample_size + j % sample_size : j * outer_size + i; dx[index] += (dy[index] * var_val - dy_sum_val / inner_size * var_val - - (x[index] - mean_val) * var_val * + (x[index] - mean_val) * var_val * var_val * dy_mul_x_sub_mean_sum_val * var_val / inner_size) * ddscale[i]; } @@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal( } // math: dx = ddscale * dy * inv_var -// math: ddy = scale * ddx * inv_var template -__global__ void DoubleGradComputeDataWithGlobal( - const T *dy, const T *scale, const T *variance, const double epsilon, - const int C, const int sample_size, const int num, T *dx) { +__global__ void DoubleGradComputeDXWithGlobal(const T *dy, const T *ddscale, + const T *variance, + const double epsilon, const int C, + const int sample_size, + const int num, T *dx) { int gid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; - if (scale != nullptr) { + if (ddscale != nullptr) { for (int i = gid; i < num; i += stride) { const int c = layout == framework::DataLayout::kNCHW ? 
i / sample_size % C : i % C; T inv_var = 1.0 / sqrt(variance[c] + epsilon); - dx[i] = dy[i] * scale[c] * inv_var; + dx[i] = dy[i] * ddscale[c] * inv_var; + } + } +} + +// math: ddy = scale * ddx * inv_var + ddbias + +// ddscale * (x - mean) * inv_var +template +__global__ void DoubleGradComputeDDYWithGlobal( + const T *ddx, const T *scale, const T *mean, const T *variance, const T *x, + const T *ddbias, const T *ddscale, const double epsilon, const int C, + const int sample_size, const int num, T *ddy) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + if (ddx != nullptr) { + for (int i = gid; i < num; i += stride) { + const int c = + layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; + T inv_var = 1.0 / sqrt(variance[c] + epsilon); + ddy[i] += ddx[i] * scale[c] * inv_var; + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = gid; i < num; i += stride) { + const int c = + layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; + T inv_var = 1.0 / sqrt(variance[c] + epsilon); + ddy[i] += (x[i] - mean[c]) * inv_var * ddscale[c]; + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = gid; i < num; i += stride) { + const int c = + layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; + ddy[i] += ddbias[c]; } } } @@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, const T *mean_data, *variance_data; if (use_global_stats) { + const auto *running_mean = ctx.Input("Mean"); const auto *running_var = ctx.Input("Variance"); + const auto *running_mean_data = running_mean->template data(); const auto *running_var_data = running_var->template data(); + mean_data = running_mean_data; variance_data = running_var_data; } else { const T *smean_data = Saved_mean->data(); @@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, set_constant(dev_ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDXWithGlobal< T, DataLayout::kNHWC><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } else { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDXWithGlobal< T, DataLayout::kNCHW><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); @@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, set_constant(dev_ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDDYWithGlobal< T, DataLayout::kNHWC><<>>( - ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, - ddy_data); + ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, + ddscale_data, epsilon, C, sample_size, num, ddy_data); } else { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDDYWithGlobal< T, DataLayout::kNCHW><<>>( - ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, - ddy_data); + ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, + ddscale_data, epsilon, C, sample_size, num, ddy_data); } } else { if (data_layout == DataLayout::kNHWC) { diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index 67d71fe82ad0b52906312d3e214e73d56012824f..7c1cf9109c566625743f69de8cf3213855600c69 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ 
b/paddle/fluid/operators/op_debug_string_test.cc @@ -13,10 +13,10 @@ // limitations under the License. #include + #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" USE_OP(elementwise_add_grad); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index 5c6c38da92808f05c90e7dad2482e7c7364a1f80..eb41d21e09218b203f887d8fd812d46dc8367c71 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -23,46 +23,54 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(Param) of DecayedAdagradOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of DecayedAdagradOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment"), - "Input(Moment) of DecayedAdagradOp should not be null."); - PADDLE_ENFORCE( - ctx->HasInput("LearningRate"), - "Input(LearningRate) of DecayedAdagradOp should not be null."); - PADDLE_ENFORCE( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); - PADDLE_ENFORCE( - ctx->GetInputsVarType("Grad").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of DecayedAdagradOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), - "Output(MomentOut) of DecayedAdagradOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", + "DecayedAdagradOp"); + OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DecayedAdagradOp"); + OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment", + "DecayedAdagradOp"); + OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate", + "DecayedAdagradOp"); + PADDLE_ENFORCE_EQ( + ctx->GetInputsVarType("Param").front(), + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), + ctx->GetInputsVarType("Param").front())); + PADDLE_ENFORCE_EQ( + ctx->GetInputsVarType("Grad").front(), + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), + ctx->GetInputsVarType("Grad").front())); + + OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut", + "DecayedAdagradOp"); + OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut", + "DecayedAdagradOp"); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function."); + platform::errors::InvalidArgument( + "Maybe the Input variable LearningRate has not " + "been initialized. 
You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "LearningRate should have one element"); + platform::errors::InvalidArgument( + "LearningRate should have one element")); auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"), - "Param and Grad input of DecayedAdagradOp should have " - "the same dimension."); - PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"), - "Param and Moment input of DecayedAdagradOp should have " - "the same dimension."); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + platform::errors::InvalidArgument( + "Param and Grad input of DecayedAdagradOp should have " + "the same dimension.")); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment"), + platform::errors::InvalidArgument( + "Param and Moment input of DecayedAdagradOp should have " + "the same dimension.")); ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("MomentOut", param_dims); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h index 279edfb015c26848d4078975a40bdca650bdc6a0..f264ebf8a32636a1e2076f8721b3c95d65f5382b 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h @@ -24,17 +24,19 @@ class DecayedAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE(param_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type())); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE(grad_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type())); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(grad_var->Type()))); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h index b579b5143ddbe6221738f9864f13fb7bea4ac509..55775bc08fb5ebc31cd231b8088a9798561fabfc 100755 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.h +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h @@ -30,7 +30,12 @@ class LarsMomentumOpKernel : public framework::OpKernel { auto learning_rate = ctx.Input("LearningRate"); auto* grad_var = ctx.InputVar("Grad"); // only support dense for now. 
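The decayed_adagrad hunks above, and the lars_momentum change that follows, apply the same error-reporting migration used throughout this patch: a bare PADDLE_ENFORCE(cond, "msg") becomes a typed comparison plus a platform::errors category carrying a formatted message. A minimal sketch of the resulting pattern, assuming the usual op_registry.h / enforce.h includes are present; "SomeInput" is a placeholder name, not an input defined by these operators:

// Sketch only: the post-migration type check used by the kernels in this patch.
void CheckInputIsLoDTensor(const paddle::framework::ExecutionContext& ctx) {
  const auto* var = ctx.InputVar("SomeInput");  // placeholder input name
  PADDLE_ENFORCE_EQ(
      var->IsType<paddle::framework::LoDTensor>(), true,
      paddle::platform::errors::InvalidArgument(
          "The Var(%s)'s type should be LoDTensor, but the received is %s",
          ctx.InputNames("SomeInput").front(),
          paddle::framework::ToTypeName(var->Type())));
}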
- PADDLE_ENFORCE_EQ(grad_var->IsType(), true); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(grad_var->Type()))); auto grad = ctx.Input("Grad"); param_out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 70d232ad6a51e21b863974e70920eb2d9da895e6..111a82c6cce78b0b6fa462c332d9bd3dd6ce6e43 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -46,14 +46,14 @@ class PixelShuffleOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The square of upscale_factor[%u] should divide the " "number of channel[%u]", - input_dims[1], upscale_factor * upscale_factor)); + upscale_factor * upscale_factor, input_dims[1])); } else { PADDLE_ENFORCE_EQ( input_dims[3] % (upscale_factor * upscale_factor), 0, platform::errors::InvalidArgument( "The square of upscale_factor[%u] should divide the " "number of channel[%u]", - input_dims[3], upscale_factor * upscale_factor)); + upscale_factor * upscale_factor, input_dims[3])); } auto output_dims = input_dims; output_dims[0] = input_dims[0]; diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index 3fb18365e523d75f5c3e6107011d2aac5a152dea..80faf833be591d0f4e2e7ee365d0163869672c02 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -12,13 +12,21 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/assign_op.h" #include "paddle/fluid/operators/tensor_formatter.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class LoDTensor; +class OpDesc; +class Scope; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { using framework::GradVarName; diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc index 0cae060bc8e87f5ad249158f98f18d7127988151..c50a8b731d5a51943d68c688339c0dd609d71630 100644 --- a/paddle/fluid/operators/pyramid_hash_op.cc +++ b/paddle/fluid/operators/pyramid_hash_op.cc @@ -285,13 +285,21 @@ class CPUPyramidHashOPKernel : public framework::OpKernel { if (use_filter) { if (white_list_len != 0) { _filter = (math::bloomfilter*)_blobs_1->data(); - PADDLE_ENFORCE_EQ(math::bloomfilter_check(_filter), 1, - "white filter not load"); + PADDLE_ENFORCE_EQ( + math::bloomfilter_check(_filter), 1, + platform::errors::PreconditionNotMet( + "The white filter is not loaded successfully, please make sure " + "'white_list_len': %d is valid for Input(WhiteList).", + white_list_len)); } if (black_list_len != 0) { _black_filter = (math::bloomfilter*)_blobs_2->data(); - PADDLE_ENFORCE_EQ(math::bloomfilter_check(_black_filter), 1, - "black filter not load"); + PADDLE_ENFORCE_EQ( + math::bloomfilter_check(_black_filter), 1, + platform::errors::PreconditionNotMet( + "The black filter is not loaded successfully, please make sure " + "'black_list_len': %d is valid for Input(BlackList).", + black_list_len)); } } diff --git a/paddle/fluid/operators/rank_loss_op.cc 
b/paddle/fluid/operators/rank_loss_op.cc index 9b7a923fb4bbcfb6e525f8360cb692c003555260..ec9d1fde4533580f862e35d01fbdb6dd0143495a 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -13,9 +13,23 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/rank_loss_op.h" -#include + #include +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 42c087b9e47a9ec7e80d05a791af3e04c483ab08..041d36a93432e793a79e3d47df95c80e7a10b460 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -18,6 +18,7 @@ #include #include #include + #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc index 2100aeb7cf4d5ca5ae0ae557c5d131ca831c39f9..ad79f6bbc4c4a82eff107e42fa60f139ccd5a818 100644 --- a/paddle/fluid/operators/reader/py_reader.cc +++ b/paddle/fluid/operators/reader/py_reader.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/operators/reader/py_reader.h" -#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reader/py_reader.h b/paddle/fluid/operators/reader/py_reader.h index d827cdd0b8764b1d6cfa0477dc85cd44525b0b29..3492d57804886309ebf1b63e8c161bc76d0c5abd 100644 --- a/paddle/fluid/operators/reader/py_reader.h +++ b/paddle/fluid/operators/reader/py_reader.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" @@ -24,6 +25,8 @@ namespace paddle { namespace operators { namespace reader { +class LoDTensorBlockingQueue; + class PyReader : public framework::FileReader { public: explicit PyReader( diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc index dc0940ac0b78d295b5088cb6ae26300da1dc883d..86c9f38ad3f9793386d100218ea3854579b884df 100644 --- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc +++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc @@ -12,12 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include // NOLINT -#include #include // NOLINT -#include -#include "gtest/gtest.h" +#include "gtest/gtest.h" #include "paddle/fluid/operators/reader/blocking_queue.h" using paddle::operators::reader::BlockingQueue; diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index 952ed4662880053833384799916fe5435d867ed0..bccaae70229b50446cc5743995cc655852529b68 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -13,8 +13,12 @@ // limitations under the License. 
#include "paddle/fluid/operators/reader/reader_op_registry.h" -#include -#include + +namespace paddle { +namespace framework { +class VarDesc; +} // namespace framework +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h index 35a0dacaec2376db67484bf934dc523cf105f363..fec496446ac01844f52eec55f9da666c76470489 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.h +++ b/paddle/fluid/operators/reader/reader_op_registry.h @@ -18,9 +18,17 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class ReaderBase; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { namespace reader { diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index c1adaf2037a34794218368ad46563efaca24e2ba..35f52ffa522f4c497a493b7e93736f9f522beb19 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -13,9 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/recurrent_op.h" - #include -#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class LoDTensor; +class OpDesc; +} // namespace framework +} // namespace paddle namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/recurrent_op.h b/paddle/fluid/operators/recurrent_op.h index 1ca66527e1b834a75f6edd1c23d7657fd52bbae7..e3f512d45c05516b0919d623b626e77f0c790a67 100644 --- a/paddle/fluid/operators/recurrent_op.h +++ b/paddle/fluid/operators/recurrent_op.h @@ -22,6 +22,12 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace platform { +class DeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc index 57df2664824d478503fce04f09c5a7f1e02eb080..2d7cce68e81719f2187bb5ce030fd7ae2b0f5755 100644 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc +++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc @@ -13,9 +13,22 @@ // limitations under the License. 
#include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h" -#include + #include +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 30265b3cc71fc6c587a7f4c716529962e1556f45..10095bc955047aa3f01cea38dbbfa87d0f842c5a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -14,6 +14,21 @@ #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all, UseInputPlace); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index cbc18f18b8e5534b37294dbfb8630bac906e8066..f288fce753802a8c5c871f573680c99e461663d6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -14,6 +14,21 @@ #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any, UseInputPlace); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index 88935107df187da731e5b77bb6c24cd692d2994f..f27cd6b125b3246b822ec4bef1e227ccff7dfc3e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -14,6 +14,19 @@ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + REGISTER_REDUCE_OP(reduce_prod); REGISTER_OP_CPU_KERNEL(reduce_prod, ops::ReduceKernel + #include +namespace paddle { +namespace framework { +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc index cb1a2962d9b1412086ad87e00131f0d4bdf9db71..d8d4e641aeb3ef660d092c404bf0edf91014b4bb 100644 --- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc +++ 
b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -12,10 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class LoDRankTable; +class LoDTensor; +class OpDesc; +class Scope; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 01a33a46521cd81d084f8971c47741b28a105d41..e03824ca8c3f428de3c89f0bc4f623379bb564f1 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -13,9 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include + #include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc index eea2d2ac57ad3aba90aced8a3ff4816d3a64123f..95b23a0b8cceefaa82f2061e9b61b83a15cb6d07 100644 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -15,6 +15,17 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +class Scope; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { class RNNMemoryHelperOp : public framework::OperatorBase { diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 9d51f3e292fa2de114b971032c29f5e769ba617a..55e35e43eb9fc95b423f2d028242ff439e97c4c2 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -14,9 +14,21 @@ limitations under the License. */ #include "paddle/fluid/operators/scale_op.h" -#include #include +namespace paddle { +namespace framework { +class InferShapeContext; +class OpDesc; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +class CPUDeviceContext; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index eb248e59b6ce6e5c9c04f94b21e4bc14207c39b1..c83726180baeae6f4b73adda3bd9d9127b0f3e26 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -13,10 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
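The include churn running through the reader, recurrent, reduce, reorder and reshape files above follows one pattern: a header that was only needed for a type name is dropped and a forward declaration is added instead, which is sufficient as long as the translation unit only refers to the type through pointers or references. A minimal sketch of the idea, with illustrative names that are not part of this patch:

// some_op.h - sketch only.
namespace paddle {
namespace framework {
class Tensor;  // forward declaration instead of including tensor.h
}  // namespace framework
}  // namespace paddle

namespace paddle {
namespace operators {
// A pointer or reference to an incomplete type is fine in a declaration;
// only the .cc file that dereferences it needs the full definition.
void CopyMeta(const framework::Tensor* src, framework::Tensor* dst);
}  // namespace operators
}  // namespace paddle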
*/ #include "paddle/fluid/operators/scatter.h" + #include -#include -#include -#include "paddle/fluid/framework/ddim.h" + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..dc92d7fcc3a87c467fa118a20ec1ab0cfbceba18 --- /dev/null +++ b/paddle/fluid/operators/segment_pool_op.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/segment_pool_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_param_config.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + segment_pool, + ops::SegmentPoolKernel, + ops::SegmentPoolKernel); +REGISTER_OP_CUDA_KERNEL( + segment_pool_grad, + ops::SegmentPoolGradKernel, + ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h index a505946b9f5229425f724ae5469beb77863e9aaf..23b0c31608d267ab6d4a81d6b9a74df0737441f0 100644 --- a/paddle/fluid/operators/segment_pool_op.h +++ b/paddle/fluid/operators/segment_pool_op.h @@ -63,6 +63,46 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, output, static_cast(0)); } +#ifdef PADDLE_WITH_CUDA + if (!cpu_place) { + Tensor length; + length.mutable_data(framework::make_ddim({1}), + platform::CPUPlace()); + IndexT* length_data = length.data(); + const IndexT* segment_ids = segment->data(); + + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), + cudaMemcpyDeviceToHost)); + + IndexT length_host = length_data[0]; + length_host++; + PADDLE_ENFORCE_GT( + length_host, 0, + platform::errors::InvalidArgument( + "Segment ids must be >= 0, but got last id %d", length_data[0])); + auto dims = input->dims(); + dims[0] = static_cast(length_host); + output->Resize({dims}); + output->mutable_data(context.GetPlace()); + T init_value = 0; + if (pooltype == "MAX") { + init_value = static_cast(-FLT_MAX); + } else if (pooltype == "MIN") { + init_value = static_cast(FLT_MAX); + } + math::SetConstant setconst; + auto& dev_ctx = context.template device_context(); + setconst(dev_ctx, output, static_cast(init_value)); + // the gpu kernel of mean pool record the counts of segment_ids + if (pooltype == "MEAN") { + summed_ids = context.Output("SummedIds"); + summed_ids->Resize({dims[0], 1}); + summed_ids->mutable_data(context.GetPlace()); + setconst(dev_ctx, summed_ids, static_cast(1e-12)); + } + } +#endif SegmentPoolFunctor pool; diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc index 
43fd84a711ff7fbf999a66bfdfe320b4daceb1a7..6eda8595b1769b0891213124ac47a3b1cf27fb7c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc @@ -13,6 +13,13 @@ // limitations under the License. #include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace platform { +class CUDADeviceContext; +} // namespace platform +} // namespace paddle template using Kernel = diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index c01fed108f3e2d7b399379520f6b0bec906f5d13..119d2e7236946e7243ef53c791f4bb7f48d21c91 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -21,13 +21,13 @@ class ShuffleChannelOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ShuffleChannelOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ShuffleChannelOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ShuffleChannelOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ShuffleChannelOp"); auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + PADDLE_ENFORCE_EQ( + input_dims.size(), 4, + platform::errors::InvalidArgument("The layout of input is NCHW.")); ctx->SetOutputDim("Out", input_dims); } @@ -53,7 +53,8 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("group", "the number of groups.") .SetDefault(1) .AddCustomChecker([](const int& group) { - PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0."); + PADDLE_ENFORCE_GE(group, 1, platform::errors::InvalidArgument( + "group should be larger than 0.")); }); AddComment(R"DOC( @@ -76,7 +77,9 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { auto input_dims = ctx->GetInputDim(framework::GradVarName("Out")); - PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + PADDLE_ENFORCE_EQ( + input_dims.size(), 4, + platform::errors::InvalidArgument("The layout of input is NCHW.")); ctx->SetOutputDim(framework::GradVarName("X"), input_dims); } diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index 94e54266f0f922efef5ea4a1b23338b6ce02d131..5b857960706f01c4636d2cb5f2b4b39c12465f99 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -14,9 +14,15 @@ limitations under the License. 
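Returning to the segment_pool_op.h hunk above: the new CUDA branch sizes the output from the last segment id, which works because the ids are assumed non-negative and sorted. A small sketch of that sizing rule, with illustrative names only:

// Sketch only: the sizing rule the CUDA branch relies on.
// Segment ids are assumed sorted, so the last id fixes the number of output
// rows even if some ids in between never occur.
#include <cstdint>
#include <vector>

int64_t NumSegments(const std::vector<int64_t>& sorted_segment_ids) {
  return sorted_segment_ids.empty() ? 0 : sorted_segment_ids.back() + 1;
}
// e.g. {0, 0, 1, 1, 3} -> 4 output rows; row 2 keeps its init value
// (-FLT_MAX for "MAX" pooling, FLT_MAX for "MIN").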
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/softmax_op.h" -#include "paddle/fluid/platform/cudnn_desc.h" #include "paddle/fluid/platform/cudnn_helper.h" +namespace paddle { +namespace platform { +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index aa8c0b13dbb2e585e97cd47a0340336b01c2c408..4adbbacc844c64329c7c62f8969cdc3f42936beb 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -13,9 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +class InferShapeContext; +class LoDTensor; +class OpDesc; +class Scope; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index f20bada8ab288fe74fd8ca82a73522a22b234191..48d6cf8b3619a7e017319faa525ae1a3ab2f96d7 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -11,8 +11,10 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/detail/strided_memcpy.h" + namespace paddle { namespace operators { @@ -60,20 +62,33 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, auto place = ctx.GetPlace(); PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(), - "src and dst tensor should have the same dims size."); + platform::errors::InvalidArgument( + "Source and destination tensor should have the same " + "dimension size, but source tensor dimension size is " + "%u, destination tensor size is %u.", + src_stride_numel.size(), dst_stride_numel.size())); for (int64_t i = 0; i < axis; ++i) { if (i < axis) { - PADDLE_ENFORCE_EQ(src_stride_numel[i] / src_stride_numel[axis], - dst_stride_numel[i] / dst_stride_numel[axis], - "src and dst should have the same elements " - "except the specified axis."); + PADDLE_ENFORCE_EQ( + src_stride_numel[i] / src_stride_numel[axis], + dst_stride_numel[i] / dst_stride_numel[axis], + platform::errors::InvalidArgument( + "Source and destination tensor should have the same number of " + "elements except the specified axis, but the source elements " + "number is %d, destination elements number is %d.", + src_stride_numel[i] / src_stride_numel[axis], + dst_stride_numel[i] / dst_stride_numel[axis])); } else if (i == axis) { continue; } else { - PADDLE_ENFORCE_EQ(src_stride_numel[i], dst_stride_numel[i], - "src and dst should have the same elements " - "except the specified axis."); + PADDLE_ENFORCE_EQ( + src_stride_numel[i], dst_stride_numel[i], + platform::errors::InvalidArgument( + "Source and destination tensor should have the same number of " + "elements except the specified axis, but the source elements " + "number is %d, destination elements number is %d.", + src_stride_numel[i], dst_stride_numel[i])); } } @@ -90,7 +105,8 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, memory::Copy(gpu_place, dst + i * dst_after, gpu_place, 
src + i * src_after, sizeof(T) * size, cuda_ctx.stream()); #else - PADDLE_THROW("Paddle is not compiled with GPU"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Paddle is not compiled with GPU.")); #endif } } diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc index 3a450773a9d749eb3f73baa46e681e588e1fbd0f..83480b44d5be08a2bb1c335f0a82be276a60a676 100644 --- a/paddle/fluid/operators/strided_memcpy_test.cc +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/strided_memcpy.h" + #include "gtest/gtest.h" -#include "paddle/fluid/memory/memory.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index 7b8b484a11e010b7f003cb83ca171b907caadaea..e4fa4a96a5cf7fab122bd49b47719aea50a1da15 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -12,11 +12,10 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/tensor_formatter.h" #include #include -#include "paddle/fluid/operators/tensor_formatter.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/tensor_formatter.h b/paddle/fluid/operators/tensor_formatter.h index 1731348479d5f5bcf5a3d3fbfe338d3ec76edaca..aee5eec0d1c29c5f5485182da24b9a53f7d27c78 100644 --- a/paddle/fluid/operators/tensor_formatter.h +++ b/paddle/fluid/operators/tensor_formatter.h @@ -18,6 +18,12 @@ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/var_type.h" +namespace paddle { +namespace framework { +class LoDTensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index 708fccf9715e41db9d56210e46a3d174c348ed63..d4eb79aa0f2fb44d41104e1dca3daec8116d02ce 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -14,9 +14,6 @@ #ifdef PADDLE_WITH_CUDA -#include -#include - #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h" namespace paddle { diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 9cfe47da5db7ba15c9b24a8d551606f805ad9b15..922340b08c63815e951155610c995c995c4bba9e 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -31,6 +31,18 @@ #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" +namespace paddle { +namespace inference { +namespace tensorrt { +class TRTCalibratorEngine; +class TRTCalibratorEngineManager; +class TRTInt8Calibrator; +} // namespace tensorrt +template +struct Singleton; +} // namespace inference +} // namespace paddle + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h index cc2fe4cdbdb8faa69abad28fbdd31dc4e61bdc04..ce94ba1ce9e8c10e04cd3fe5fee214ef30ffb918 100644 --- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h +++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h @@ 
-16,6 +16,7 @@ #include #include + #include "gtest/gtest.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index aec995304a77118ecbf788ca3984c7e9da531f18..05d077b173a13e457fd38187b832f9586926a2ee 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -41,7 +41,9 @@ inline std::vector GetDataFromTensor(const framework::Tensor* x) { // NOTE: Converting int64 to int32 may cause data overflow. vec_new_data = std::vector(data, data + x->numel()); } else { - PADDLE_THROW("The dtype of Tensor must be int32 or int64."); + PADDLE_THROW(platform::errors::InvalidArgument( + "The dtype of Tensor must be int32 or int64, but received: %s", + x->type())); } return vec_new_data; } @@ -53,10 +55,11 @@ inline std::vector GetDataFromTensorList( for (size_t i = 0; i < list_tensor.size(); ++i) { auto tensor = list_tensor[i]; PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}), - "ShapeError: The shape of Tensor in list must be [1]. " - "But received the shape " - "is [%s]", - tensor->dims()); + platform::errors::InvalidArgument( + "The shape of Tensor in list must be [1]. " + "But received its shape " + "is [%s]", + tensor->dims())); if (tensor->type() == framework::proto::VarType::INT32) { if (platform::is_gpu_place(tensor->place())) { @@ -76,7 +79,10 @@ inline std::vector GetDataFromTensorList( vec_new_data.push_back(static_cast(*tensor->data())); } } else { - PADDLE_THROW("The dtype of Tensor in list must be int32 or int64."); + PADDLE_THROW(platform::errors::InvalidArgument( + "The dtype of Tensor in list must be int32 or int64, but received: " + "%s", + tensor->type())); } } return vec_new_data; diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index f8a29a52d7a3d9332b9dcb8189dfd7c1df902faa..db8b2c30501bd7f291b23728a26dcd3ea27e0ec5 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -78,21 +78,35 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { platform::errors::NotFound("Col(Output) of VarConv2dOP is not found.")); auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, - "The rank of X(Input) can't be less than 2."); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The rank of X(Input) can't be less than 2, but received rank is %u.", + x_dims.size())); auto w_dims = ctx->GetInputDim("W"); - PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); + PADDLE_ENFORCE_EQ( + w_dims.size(), 2, + platform::errors::InvalidArgument( + "Input W should be a 2-D tensor, but its actual dimension is %u.", + w_dims.size())); int output_channel = ctx->Attrs().Get("OutputChannel"); int input_channel = ctx->Attrs().Get("InputChannel"); int kernel_h = ctx->Attrs().Get("KernelH"); int kernel_w = ctx->Attrs().Get("KernelW"); - PADDLE_ENFORCE_EQ(w_dims[0], output_channel, - "W dim[0] should be equal to OutputChannel"); + PADDLE_ENFORCE_EQ( + w_dims[0], output_channel, + platform::errors::InvalidArgument( + "Input W's dimension[0] should be equal to OutputChannel, the " + "dimension[0] is %d, OutputChannel is %d.", + w_dims[0], output_channel)); PADDLE_ENFORCE_EQ( w_dims[1], input_channel * kernel_h * kernel_w, - "W dim[1] should be equal to InputChannel * StrideH * StrideW"); + platform::errors::InvalidArgument( + "Input W's dimension[1] should be equal to InputChannel * StrideH * " + 
"StrideW, the dimension[1] is %d, expected value is %d.", + w_dims[1], input_channel * kernel_h * kernel_w)); if (ctx->IsRuntime()) { framework::Variable* x_var = @@ -103,10 +117,14 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { platform::errors::InvalidArgument("The Input(X) Tensor of VarConv2dOP " "does not contain LoD information.")); - PADDLE_ENFORCE_GE(x_lod.size(), 1, "The Input(X)'s lod info is corrupted."); - PADDLE_ENFORCE_EQ( - x_dims[0], static_cast(x_lod[0].back()), - "The Input(X)'s lod info mismatches the actual tensor shape."); + PADDLE_ENFORCE_GE(x_lod.size(), 1, + platform::errors::InvalidArgument( + "The Input(X)'s lod info is corrupted.")); + PADDLE_ENFORCE_EQ(x_dims[0], static_cast(x_lod[0].back()), + platform::errors::InvalidArgument( + "The Input(X)'s lod info mismatches the actual " + "tensor shape, input lod is %s, tensor shape is %s.", + x_lod, x_dims)); framework::Variable* row_var = BOOST_GET(framework::Variable*, ctx->GetInputVarPtrs("ROW")[0]); diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h index 742329abb2dae20437120c0d4ba5975d41b0a7c9..4460139219fb520d906b4a135d1b6f02d782ae2c 100644 --- a/paddle/fluid/platform/bfloat16.h +++ b/paddle/fluid/platform/bfloat16.h @@ -23,9 +23,15 @@ #endif #include + #include "paddle/fluid/platform/hostdevice.h" #include "unsupported/Eigen/CXX11/Tensor" +namespace Eigen { +template +struct NumTraits; +} // namespace Eigen + namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/bfloat16_test.cc b/paddle/fluid/platform/bfloat16_test.cc index bdb508ee33630004daae132fcdcf71146a50e640..fc964d7df359a3a07bfd515a80175adbcd372e48 100644 --- a/paddle/fluid/platform/bfloat16_test.cc +++ b/paddle/fluid/platform/bfloat16_test.cc @@ -10,14 +10,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" - #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/init.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 4cb6ee3143a862f59a3c224a012d380e629e738f..54dac976276904809003903324535824819e979f 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -14,12 +14,8 @@ #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" - -#include #include -#include "paddle/fluid/platform/dynload/nccl.h" - namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index b86fd70c9aecddca7c1ce23085a46c5332d2e698..e379832593c78c6ca2ad94bf72beaafded13a205 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_info.h" #ifdef PADDLE_WITH_XBYAK -#include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" #endif diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 67ea64833d3b844d88a2e5996f860ef165bd8ffd..a5dd19d4363d6a8fa99cf48ef2969186de605127 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -128,5 +128,150 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { } #endif + +// For atomicMax +USE_CUDA_ATOMIC(Max, int); +USE_CUDA_ATOMIC(Max, unsigned int); +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT +#else +CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { + if (*address >= val) { + return; + } + + unsigned long long int old = *address, assumed; + + do { + assumed = old; + if (assumed >= val) { + break; + } + + old = atomicCAS(address, assumed, val); + } while (assumed != old); +} +#endif + +CUDA_ATOMIC_WRAPPER(Max, int64_t) { + // Here, we check long long int must be int64_t. + static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT + "long long should be int64"); + return CudaAtomicMax( + reinterpret_cast(address), // NOLINT + static_cast(val)); // NOLINT +} + +CUDA_ATOMIC_WRAPPER(Max, float) { + if (*address >= val) { + return; + } + + int *const address_as_i = (int *)address; + int old = *address_as_i, assumed; + + do { + assumed = old; + if (__int_as_float(assumed) >= val) { + break; + } + + old = atomicCAS(address_as_i, assumed, __float_as_int(val)); + } while (assumed != old); +} + +CUDA_ATOMIC_WRAPPER(Max, double) { + if (*address >= val) { + return; + } + + unsigned long long int *const address_as_ull = + (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + if (__longlong_as_double(assumed) >= val) { + break; + } + + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); + } while (assumed != old); +} + +// For atomicMin +USE_CUDA_ATOMIC(Min, int); +USE_CUDA_ATOMIC(Min, unsigned int); +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT +#else +CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { + if (*address <= val) { + return; + } + + unsigned long long int old = *address, assumed; + + do { + assumed = old; + if (assumed <= val) { + break; + } + + old = atomicCAS(address, assumed, val); + } while (assumed != old); +} +#endif + +CUDA_ATOMIC_WRAPPER(Min, int64_t) { + // Here, we check long long int must be int64_t. 
+ static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT + "long long should be int64"); + return CudaAtomicMin( + reinterpret_cast(address), // NOLINT + static_cast(val)); // NOLINT +} + +CUDA_ATOMIC_WRAPPER(Min, float) { + if (*address <= val) { + return; + } + + int *const address_as_i = (int *)address; + int old = *address_as_i, assumed; + + do { + assumed = old; + if (__int_as_float(assumed) <= val) { + break; + } + + old = atomicCAS(address_as_i, assumed, __float_as_int(val)); + } while (assumed != old); +} + +CUDA_ATOMIC_WRAPPER(Min, double) { + if (*address <= val) { + return; + } + + unsigned long long int *const address_as_ull = + (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + if (__longlong_as_double(assumed) <= val) { + break; + } + + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); + } while (assumed != old); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h index 957bdf1e698d0aedb86c5b0cb732ab545c260bcc..a9382f2c8adcb18e320ef44086a312f89c03ad09 100644 --- a/paddle/fluid/platform/cuda_profiler.h +++ b/paddle/fluid/platform/cuda_profiler.h @@ -24,7 +24,11 @@ namespace platform { void CudaProfilerInit(std::string output_file, std::string output_mode, std::string config_file) { - PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); + PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv", + platform::errors::InvalidArgument( + "Unsupported cuda profiler output mode, expect `kvp` or " + "`csv`, but received `%s`.", + output_mode)); cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair; PADDLE_ENFORCE_CUDA_SUCCESS( cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); diff --git a/paddle/fluid/platform/cuda_resource_pool.h b/paddle/fluid/platform/cuda_resource_pool.h index 22b53445d84c1f72481f559ff45c4864fb5c7263..570b68b08fc1e6fd0b9ff70dba244c355fc2618d 100644 --- a/paddle/fluid/platform/cuda_resource_pool.h +++ b/paddle/fluid/platform/cuda_resource_pool.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/platform/resource_pool.h" namespace paddle { diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h index e0ba1aaa6bd8b8b29b0d02259d91b9bb9470e35f..0e0218dcca3fc4d7ea661fbcfe89d260a4c93a2d 100644 --- a/paddle/fluid/platform/cudnn_desc.h +++ b/paddle/fluid/platform/cudnn_desc.h @@ -22,8 +22,15 @@ #include #include #include + #include "paddle/fluid/platform/cudnn_helper.h" +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +} // namespace paddle + namespace paddle { namespace platform { using framework::Tensor; diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 4b9c5c429dabc32fad6f05e4f066ab063057e733..e983e36895353c215af19937980946a33c242b8c 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -23,6 +23,12 @@ limitations under the License. 
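For the CudaAtomicMax / CudaAtomicMin wrappers added to cuda_primitives.h above, the floating-point versions emulate an atomic max/min with an atomicCAS retry loop and bail out early when the stored value already satisfies the bound. A minimal usage sketch (the kernel below is illustrative, not part of this patch), assuming the output value was initialized to -FLT_MAX on the host before launch:

// Sketch only: reduce a global maximum with the new wrapper.
#include "paddle/fluid/platform/cuda_primitives.h"

__global__ void GlobalMaxKernel(const float* in, int n, float* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Falls back to the atomicCAS loop on types without a native atomicMax.
    paddle::platform::CudaAtomicMax(out, in[i]);
  }
}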
*/ #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" +namespace paddle { +namespace platform { +struct float16; +} // namespace platform +} // namespace paddle + DECLARE_bool(cudnn_deterministic); namespace paddle { diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc index 9d5a0954b00b1755a86cbd5d654b9a06edff4879..2474903edf77c1d0d22845fa44a23f82be323417 100644 --- a/paddle/fluid/platform/device_code.cc +++ b/paddle/fluid/platform/device_code.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_code.h" + #include #include #include #include + #include "paddle/fluid/platform/enforce.h" DECLARE_string(cuda_dir); diff --git a/paddle/fluid/platform/device_code.h b/paddle/fluid/platform/device_code.h index 6128d8b78db05e6ba92ffae46ee4c9cc8895c105..4199317a8ceb03dddc5cc439dc455725b8863678 100644 --- a/paddle/fluid/platform/device_code.h +++ b/paddle/fluid/platform/device_code.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cuda_driver.h" diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 28d94627f9575573075beaa328a682314b5c3b71..e1438a1eefa62b238241c3185daf69d3418f2dc9 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/memory/malloc.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_helper.h" @@ -35,6 +36,7 @@ limitations under the License. */ #endif #include + #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -44,6 +46,11 @@ limitations under the License. */ #define EIGEN_USE_THREADS #include "unsupported/Eigen/CXX11/Tensor" +namespace Eigen { +struct DefaultDevice; +struct GpuDevice; +} // namespace Eigen + #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu_header.h" #endif @@ -118,8 +125,8 @@ struct DefaultDeviceContextType { #ifdef PADDLE_WITH_CUDA -class EigenCudaStreamDevice; class CudnnWorkspaceHandle; +class EigenCudaStreamDevice; class CUDAContext { public: diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h index 2c19a2b10626d0e312c267feba0104a23005909e..5cc33fd31f19cfc1512b1d41c5c9dc918f3c6491 100644 --- a/paddle/fluid/platform/device_memory_aligment.h +++ b/paddle/fluid/platform/device_memory_aligment.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 85168a046fb3fa4317956737871cde56e15bedfb..9bae7a870522cd1019a8bf0facb2cfb6e6772118 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -28,6 +28,8 @@ namespace platform { /////////////////////// // WARN: Under Development. Don't depend on it yet. 
////////////////////// +class Event; + inline uint64_t PosixInNsec() { struct timeval tv; gettimeofday(&tv, nullptr); diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 562e7542012247c86add9e64f182d857ea969c60..045caab748811aac32f0ad3d20dfed1663a96f4a 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include // NOLINT #include + #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h index 056fcc069dba927fec0f04784f9edfd4e44ef3d2..5799b084f5f3157fda90583b128dcdce4485cc83 100644 --- a/paddle/fluid/platform/dynload/cuda_driver.h +++ b/paddle/fluid/platform/dynload/cuda_driver.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include // NOLINT + #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index 1166dc5e4ad93fa23ef00623de6777b78b56ea09..4c59fe5e9bae4b751d87b0d2feb1ea0bd02bcf1d 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -30,6 +30,10 @@ CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8 +CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DEFINE_WRAP); +#endif + #ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); #endif @@ -54,6 +58,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_R8 +CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); +#endif + bool HasCUDNN() { std::call_once(cudnn_dso_flag, []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index fba41417648ba606727d00e71f48766f47479989..88b545b48e5328883c1554c0efc75140006ce787 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include - #include +#include #include // NOLINT + #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/port.h" @@ -134,6 +134,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \ __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ __macro(cudnnSetRNNDescriptor); CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif diff --git a/paddle/fluid/platform/dynload/cupti.cc b/paddle/fluid/platform/dynload/cupti.cc index a25660c6ed411bbe444ac8aa10a324cbed9c9d4f..d8381580c90d4ef9cd1bf5adbce7a733f81c91e0 100644 --- a/paddle/fluid/platform/dynload/cupti.cc +++ b/paddle/fluid/platform/dynload/cupti.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUPTI #include "paddle/fluid/platform/dynload/cupti.h" -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 48076e5478a30d86802e10f35bfa7cabfc0deab9..7a160664bc2e8973951892c23981fdd746b2123f 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -14,11 +14,10 @@ limitations under the License. */ #pragma once #include - #include // NOLINT -#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h index ba2d38729c12e2c5fc978a2768cf36c3af3b7c4c..561f20af45ab5c38f18ed161cb2135589b71a932 100644 --- a/paddle/fluid/platform/dynload/cusolver.h +++ b/paddle/fluid/platform/dynload/cusolver.h @@ -15,11 +15,10 @@ limitations under the License. */ #include #include - #include // NOLINT -#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 82e4f6ac75ec1e3cc927a4018b83616298eefbff..0c8a64ccf6943aa4359ed6ae094e12e4d6bccf8d 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include -#include // NOLINT #include #include @@ -22,7 +20,6 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/dynload/cupti_lib_path.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/port.h" DEFINE_string(cudnn_dir, "", "Specify path for loading libcudnn.so. For instance, " diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 2be95b113b2f243b9813353877ddc61421b1a56c..9369cf131da9aba2f1c0e8b85441f4eba74996bc 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include // NOLINT + #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index 1d5fa45ecf684597be5a3d5234456a871221d329..407f34f0ac3e0e1d5244552834207b3ca28cbe89 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once #include - #include // NOLINT + #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h index 9464a23ba1ef9f0b24c4ef727c2de8176149f166..720450d28b16f204cfc54b4e617adbe66997f539 100644 --- a/paddle/fluid/platform/dynload/nvrtc.h +++ b/paddle/fluid/platform/dynload/nvrtc.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include // NOLINT + #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index 67a79ce4bb1594afd23d960d18b75a8f0f1b2513..cb751071062c1eae8b42e6dcdfba04989480c512 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -35,7 +35,7 @@ extern void* tensorrt_dso_handle; #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + void* operator()(Args... args) { \ std::call_once(tensorrt_dso_flag, []() { \ tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \ }); \ @@ -44,7 +44,8 @@ extern void* tensorrt_dso_handle; return nullptr; \ } \ using tensorrt_func = decltype(&::__name); \ - return reinterpret_cast(p_##__name)(args...); \ + auto ret = reinterpret_cast(p_##__name)(args...); \ + return static_cast(ret); \ } \ }; \ extern DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h index bc1977b05de5da062fae5662dfb51d4a74868c8a..e10a7233b6248705492f641044771e6531fb00a4 100644 --- a/paddle/fluid/platform/dynload/warpctc.h +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include // NOLINT + #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/port.h" #include "warpctc/include/ctc.h" diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index ce1ec507307a2721e641ac15425c6a2321e514c7..a3ae9e48eea30616770b6e6babcefc5026f22905 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -70,6 +70,12 @@ limitations under the License. */ #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/type_defs.h" +namespace paddle { +namespace platform { +class ErrorSummary; +} // namespace platform +} // namespace paddle + DECLARE_int32(call_stack_level); namespace paddle { diff --git a/paddle/fluid/platform/errors_test.cc b/paddle/fluid/platform/errors_test.cc index 3c84215b5e5900897a1c49a0f81428dd7beae5e7..a73c1ba3d34379df2e3999dd1a8519a5a9c792f9 100644 --- a/paddle/fluid/platform/errors_test.cc +++ b/paddle/fluid/platform/errors_test.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "gtest/gtest.h" diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index 261ec68483faf6ca7a34a641cd53cd2113381e9c..ec8a98eeb1a14d88a5aecea49140a3c2dd177b78 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -10,13 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/float16.h" -#include - #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/init.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc index 33d0fe6268046b3bcbd4addc75fcf34c03d70bf9..0be4233269e0f4f732c6b6a2622e7db3cb8e37e3 100644 --- a/paddle/fluid/platform/lodtensor_printer.cc +++ b/paddle/fluid/platform/lodtensor_printer.cc @@ -13,9 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/lodtensor_printer.h" -#include "paddle/fluid/framework/lod_tensor_array.h" + #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { +class LoDTensor; +class Variable; +} // namespace framework +} // namespace paddle namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/lodtensor_printer.h b/paddle/fluid/platform/lodtensor_printer.h index e070e3540c996a0fe248a3b9312c18d948395426..e0bd1fff197f700303e3e7d9c9afe18937a1b9e6 100644 --- a/paddle/fluid/platform/lodtensor_printer.h +++ b/paddle/fluid/platform/lodtensor_printer.h @@ -14,8 +14,15 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/scope.h" +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + namespace paddle { namespace platform { void PrintVar(framework::Scope* scope, const std::string& var_name, diff --git a/paddle/fluid/platform/lodtensor_printer_test.cc b/paddle/fluid/platform/lodtensor_printer_test.cc index 19e85284b8fc8842b2e5662343c74fc451b08d9e..5b2af270740766307990c5cd53585a9c62606da3 100644 --- a/paddle/fluid/platform/lodtensor_printer_test.cc +++ b/paddle/fluid/platform/lodtensor_printer_test.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/platform/lodtensor_printer.h" + #include "gtest/gtest.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable.h" TEST(LodTensorPrinter, PrintVar) { paddle::framework::Scope scope; diff --git a/paddle/fluid/platform/monitor.cc b/paddle/fluid/platform/monitor.cc index e65e09f45c0c809f126309ebc16915ab42e1ff0a..76554012bf51e34fc99db7759404f0e8d6f96cd6 100644 --- a/paddle/fluid/platform/monitor.cc +++ b/paddle/fluid/platform/monitor.cc @@ -13,7 +13,6 @@ // limitations under the License. 
#include "paddle/fluid/platform/monitor.h" -#include namespace paddle { namespace platform {} // namespace platform diff --git a/paddle/fluid/platform/monitor.h b/paddle/fluid/platform/monitor.h index 2f186420b414280c40ece763307e8e90c9a6d505..b57fae9daac41f37829309c4bc5f58fb2606ca02 100644 --- a/paddle/fluid/platform/monitor.h +++ b/paddle/fluid/platform/monitor.h @@ -22,6 +22,7 @@ #include #include #include + #include "glog/logging.h" namespace paddle { diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc index 13f28c73f4504aea85d6155a3daa8f8f01b26385..41e084efa57004c3935e8a9f4200c1e5a4e8f664 100644 --- a/paddle/fluid/platform/place_test.cc +++ b/paddle/fluid/platform/place_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/platform/place.h" -#include + #include "gtest/gtest.h" TEST(Place, Equality) { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 85759bc6e2ea3700df6a17f885385b85dfbcb6a3..56a6275b582d753c0fa579d624b7f50c16f579b8 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -12,28 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include #include -#include #include // NOLINT #include -#include #include -#include -#ifdef PADDLE_WITH_CUDA -#include -#endif // PADDLE_WITH_CUDA -#include "glog/logging.h" -#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/errors.h" -#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler_helper.h" -#include "paddle/fluid/string/printf.h" DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 12049d815cf85dbd54d9188240dbfa5213e94447..0185328ff32004a35283c1d2883c2834b49b473b 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -23,6 +23,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/event.h" diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index 4d340a06342822ffd44bac7ebc6114424d9bab60..2ce898d46171e190f546d8715a1f02845d62a848 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -13,10 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" + #include -#ifdef PADDLE_WITH_CUDA -#include -#endif + #include "gtest/gtest.h" TEST(Event, CpuElapsedTime) { diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index 4272d5fd0b14c5d705d4af84691b3b8d24d1c7f5..c65d107cf454676777ef85bf557299ab3f07483c 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h index ff0e1d95c2946b6db3ac0c05acba64ff5d3c59ef..09dcc4369beb06f11a429e4b9ffbae454a34f441 100644 --- a/paddle/fluid/platform/timer.h +++ b/paddle/fluid/platform/timer.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/platform/port.h" #ifdef _WIN32 diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 489dd198876204486fc94518fbef0c806d0543d4..da9900e2b271d08394cbc5e397f31b84e3b4d156 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -649,61 +649,47 @@ void BindImperative(py::module *m_ptr) { return self.NewVarBase(tensor.place(), false); }, py::return_value_policy::copy, R"DOC( - **Notes**: - **This API is ONLY available in Dygraph mode** - Returns a new Variable, detached from the current graph. - - Returns: - ( :ref:`api_guide_Variable_en` | dtype is same as current Variable): The detached Variable. + Returns a new Tensor, detached from the current graph. + Returns: The detached Tensor. Examples: .. code-block:: python - import paddle.fluid as fluid - from paddle.fluid.dygraph.base import to_variable - from paddle.fluid.dygraph import Linear - import numpy as np - - data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32') - with fluid.dygraph.guard(): - linear = Linear(32, 64) - data = to_variable(data) - x = linear(data) - y = x.detach() + import paddle + paddle.disable_static() + linear = paddle.nn.Linear(32, 64) + data = paddle.uniform(shape=[30, 10, 32], min=-1.0, max=1.0) + x = linear(data) + y = x.detach() )DOC") .def("clear_gradient", &imperative::VarBase::ClearGradient, R"DOC( - **Notes**: - **1. This API is ONLY available in Dygraph mode** - - **2. Use it only Variable has gradient, normally we use this for Parameters since other temporal Variable will be deleted by Python's GC** + Only for a Tensor that has a gradient, normally we use this for Parameters since other temporary Tensors don't have gradients. - Clear (set to ``0`` ) the Gradient of Current Variable + The gradient of the current Tensor will be set to ``0`` . Returns: None Examples: ..
code-block:: python - import paddle.fluid as fluid - import numpy as np - - x = np.ones([2, 2], np.float32) - with fluid.dygraph.guard(): - inputs2 = [] - for _ in range(10): - tmp = fluid.dygraph.base.to_variable(x) - tmp.stop_gradient=False - inputs2.append(tmp) - ret2 = fluid.layers.sums(inputs2) - loss2 = fluid.layers.reduce_sum(ret2) - loss2.backward() - print(loss2.gradient()) - loss2.clear_gradient() - print("After clear {}".format(loss2.gradient())) + import paddle + paddle.disable_static() + + inputs = [] + for _ in range(10): + tmp = paddle.ones([2, 2]) + tmp.stop_gradient=False + inputs.append(tmp) + ret = paddle.sums(inputs) + loss = paddle.reduce_sum(ret) + loss.backward() + print("Before clear_gradient {}".format(loss.grad)) + loss.clear_gradient() + print("After clear_gradient {}".format(loss.grad)) )DOC") .def("_run_backward", [](imperative::VarBase &self, const imperative::Tracer &tracer, diff --git a/paddle/fluid/string/piece_test.cc b/paddle/fluid/string/piece_test.cc index 80b712b08ccbcc3fc33e26f8c8c5b0531a71d974..544b5985ed21432488200768a28a3bae69f00a7f 100644 --- a/paddle/fluid/string/piece_test.cc +++ b/paddle/fluid/string/piece_test.cc @@ -14,8 +14,6 @@ #include "paddle/fluid/string/piece.h" -#include - #include "gtest/gtest.h" TEST(StringPiece, Construct) { diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index da4c1f326fbc2703e639279d79acb52dc748266a..5e2aedb22ad6899d046ffc8d11b0a2df2fca5ae3 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/string/printf.h" DECLARE_bool(color); diff --git a/paddle/fluid/string/string_helper.cc b/paddle/fluid/string/string_helper.cc index 712db90d2f4ba62abdb0747827d78266503efcc5..8731e8fca8a5c4d8b13cf5fb6b38c5cf710225aa 100644 --- a/paddle/fluid/string/string_helper.cc +++ b/paddle/fluid/string/string_helper.cc @@ -13,12 +13,12 @@ // limitations under the License.
#include "paddle/fluid/string/string_helper.h" + #include #include #include #include -#include -#include "boost/lexical_cast.hpp" + #include "glog/logging.h" namespace paddle { diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h index 8bf379a6b3423fed07d3ab670e0492a9316e8e19..499539226bd318d5ee297eccc81c13c7997bb5b9 100644 --- a/paddle/fluid/string/string_helper.h +++ b/paddle/fluid/string/string_helper.h @@ -20,6 +20,7 @@ #include #include #include + #include "boost/lexical_cast.hpp" #include "glog/logging.h" diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index 235d92ac4f9e88947cea04425b0916b8a0290979..d587081fbac8a27df18bdacba3d94f6adcd3b171 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -26,7 +26,7 @@ function(train_test TARGET_NAME) ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) endif() set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES DEPENDS test_${TARGET_NAME}) + PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model) if(NOT WIN32 AND NOT APPLE) set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES TIMEOUT 150) diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc index 1087f5672459506cc7b824127cd822c0df7ba566..1ef98720f83697715c05e868177faba489fd8760 100644 --- a/paddle/fluid/train/demo/demo_trainer.cc +++ b/paddle/fluid/train/demo/demo_trainer.cc @@ -29,7 +29,9 @@ namespace train { void ReadBinaryFile(const std::string& filename, std::string* contents) { std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + PADDLE_ENFORCE_EQ( + fin.is_open(), true, + platform::errors::Unavailable("Failed to open file %s.", filename)); fin.seekg(0, std::ios::end); contents->clear(); contents->resize(fin.tellg()); @@ -70,7 +72,8 @@ int main() { } } - PADDLE_ENFORCE_NE(loss_name, "", "loss not found"); + PADDLE_ENFORCE_NE(loss_name, "", + platform::errors::NotFound("Loss name is not found.")); // init all parameters executor.Run(*startup_program, &scope, 0); diff --git a/paddle/fluid/train/imdb_demo/demo_trainer.cc b/paddle/fluid/train/imdb_demo/demo_trainer.cc index d45edd563f03d7a1b156d063d5e7296290d0eaba..a08069a57ca824f307b4bf8836237f573ab3c429 100644 --- a/paddle/fluid/train/imdb_demo/demo_trainer.cc +++ b/paddle/fluid/train/imdb_demo/demo_trainer.cc @@ -45,7 +45,9 @@ namespace train { void ReadBinaryFile(const std::string& filename, std::string* contents) { std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + PADDLE_ENFORCE_EQ( + fin.is_open(), true, + platform::errors::Unavailable("Failed to open file %s.", filename)); fin.seekg(0, std::ios::end); contents->clear(); contents->resize(fin.tellg()); @@ -98,7 +100,11 @@ int main(int argc, char* argv[]) { file_vec.push_back(filename); } } - PADDLE_ENFORCE_GE(file_vec.size(), 1, "At least one file to train"); + PADDLE_ENFORCE_GE( + file_vec.size(), 1, + platform::errors::InvalidArgument( + "At least one file to train, but received number of file is %d.", + file_vec.size())); paddle::framework::InitDevices(false); const auto cpu_place = paddle::platform::CPUPlace(); paddle::framework::Executor executor(cpu_place); @@ -148,7 +154,9 @@ int main(int argc, char* argv[]) { const std::vector readers = dataset_ptr->GetReaders(); PADDLE_ENFORCE_EQ(readers.size(), 1, - "readers num should be 
equal to thread num"); + platform::errors::InvalidArgument( + "Readers num(%d) should be equal to thread num(1).", + readers.size())); readers[0]->SetPlace(paddle::platform::CPUPlace()); const std::vector& input_feed_names = readers[0]->GetUseSlotAlias(); diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc index 45c438e8925b4e0a88e61ad509b88cd6226773a4..e7b698e1a34e267e392d696b67b92cd2e8c23f3b 100644 --- a/paddle/fluid/train/test_train_recognize_digits.cc +++ b/paddle/fluid/train/test_train_recognize_digits.cc @@ -51,7 +51,8 @@ void Train() { } } - PADDLE_ENFORCE_NE(loss_name, "", "loss not found"); + PADDLE_ENFORCE_NE(loss_name, "", + platform::errors::NotFound("Loss name is not found.")); // prepare data auto x_var = scope.Var("img"); diff --git a/paddle/scripts/installation_validate.py b/paddle/scripts/installation_validate.py index f84e2f4b176609dec28a8e29afea74d3654e9e4c..b765291a3b80fb3d5bf4e1331613eb3fd7b0dc79 100644 --- a/paddle/scripts/installation_validate.py +++ b/paddle/scripts/installation_validate.py @@ -15,4 +15,5 @@ import paddle.fluid as fluid import paddle as pd +fluid.install_check.run_check() print(pd.__version__) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 60e4496bc545759173b68efbf85922efe8976fa4..7ad2787d181588f4f2facf5db72e033feef87989 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -26,6 +26,7 @@ wmic process where name="op_function_generator.exe" call terminate 2>NUL rem ------initialize common variable------ if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" if not defined BRANCH set BRANCH=develop +if not defined TENSORRT_ROOT set TENSORRT_ROOT="C:/TensorRT-5.1.5.0" if not defined WITH_MKL set WITH_MKL=ON if not defined WITH_GPU set WITH_GPU=OFF if not defined WITH_AVX set WITH_AVX=ON @@ -33,10 +34,13 @@ if not defined WITH_TESTING set WITH_TESTING=ON if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON +if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON if not defined WITH_CACHE set WITH_CACHE=ON if not defined WITH_TPCACHE set WITH_TPCACHE=ON + rem -------set cache build work directory----------- +rmdir build\python /s/q if "%WITH_CACHE%"=="OFF" ( rmdir build /s/q goto :mkbuild @@ -45,10 +49,10 @@ if "%WITH_CACHE%"=="OFF" ( for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# set day_now=%datetime:~6,2% set day_before=-1 -set /p day_before= day.txt - type day.txt + echo %day_now% > %work_dir%\..\day.txt + type %work_dir%\..\day.txt rmdir build /s/q ) git diff origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" @@ -99,6 +103,7 @@ set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 :: set maximum cache size to 20G clcache.exe -M 21474836480 + rem ------set cache third_party------ set cache_dir=%work_dir:Paddle=cache% dir %cache_dir% @@ -138,6 +143,7 @@ exit /b 1 :CASE_wincheck_mkl set WITH_MKL=ON set WITH_GPU=OFF +set MSVC_STATIC_CRT=ON call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error @@ -149,11 +155,13 @@ goto:success :CASE_wincheck_openblas set WITH_MKL=OFF set WITH_GPU=ON +set MSVC_STATIC_CRT=OFF rem Temporarily turn off WITH_INFERENCE_API_TEST on GPU due to compile hang set WITH_INFERENCE_API_TEST=OFF call :cmake || goto 
cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error +:: call :test_inference || goto test_inference_error goto:success rem "Other configurations are added here" @@ -172,12 +180,14 @@ set start=%start:~4,10% echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ -DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ --DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ +-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ -DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ --DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ +-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% goto:eof :cmake_error @@ -199,7 +209,7 @@ echo Build third_party the %build_times% time: msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 - if %build_times% GTR 3 ( + if %build_times% GTR 2 ( exit /b 7 ) else ( echo Build third_party failed, will retry! @@ -214,7 +224,7 @@ echo Build Paddle the %build_times% time: msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 - if %build_times% GTR 2 ( + if %build_times% GTR 1 ( exit /b 7 ) else ( echo Build Paddle failed, will retry! @@ -282,7 +292,9 @@ dir %THIRD_PARTY_PATH:/=\%\install\mklml\lib dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin -set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% +set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;^ +%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^ +%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4 goto:eof @@ -290,6 +302,7 @@ goto:eof call paddle_winci\Scripts\deactivate.bat 2>NUL for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# set end=%end:~4,10% +call :timestamp "%start%" "%end%" "1 card TestCases Total" call :timestamp "%start%" "%end%" "TestCases Total" echo Running unit tests failed, will exit! 
exit /b 8 @@ -302,10 +315,11 @@ echo ======================================== for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# set end=%end:~4,10% +call :timestamp "%start%" "%end%" "1 card TestCases Total" call :timestamp "%start%" "%end%" "TestCases Total" cd %work_dir%\paddle\fluid\inference\api\demo_ci -%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo +%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT%/include %TENSORRT_ROOT%/lib %MSVC_STATIC_CRT% goto:eof :test_inference_error @@ -334,6 +348,8 @@ echo ============================================ >> check_change_of_unitte echo EOF>> check_change_of_unittest.sh echo spec_path=$(pwd)/UNITTEST_PR.spec>> check_change_of_unittest.sh echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh +echo num=$(awk 'END{print NR}' ${spec_path})>> check_change_of_unittest.sh +echo echo "Windows 1 card TestCases count is $num">> check_change_of_unittest.sh echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>> check_change_of_unittest.sh echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>> check_change_of_unittest.sh echo if [ "$origin_upstream_url" == "" ]; then>> check_change_of_unittest.sh @@ -444,8 +460,6 @@ taskkill /f /im cvtres.exe 2>NUL taskkill /f /im rc.exe 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL taskkill /f /im python.exe 2>NUL -call paddle_winci\Scripts\deactivate.bat 2>NUL -del %PADDLE_WHL_FILE_WIN% taskkill /f /im python.exe 2>NUL echo Windows CI run successfully! exit /b 0 diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index c19bd56fbbf4de343abea198674eaf8b545321f5..5400c55a0b150ecc0f534ed1bbb83aba236ec24b 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -12,12 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" -#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/init.h" int main(int argc, char** argv) { diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py index 54aa3edc51d3734633ce077a59bd86cec8d09032..d1e5975856515b1fc4f6aba67e8a110e3288cc33 100644 --- a/python/paddle/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -22,7 +22,6 @@ import paddle.dataset.cifar import paddle.dataset.movielens import paddle.dataset.conll05 import paddle.dataset.uci_housing -import paddle.dataset.sentiment import paddle.dataset.wmt14 import paddle.dataset.wmt16 import paddle.dataset.mq2007 @@ -37,7 +36,6 @@ __all__ = [ 'cifar', 'movielens', 'conll05', - 'sentiment', 'uci_housing', 'wmt14', 'wmt16', diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py deleted file mode 100644 index 721cb5a819282d5ef130de4d4596116326349d71..0000000000000000000000000000000000000000 --- a/python/paddle/dataset/sentiment.py +++ /dev/null @@ -1,150 +0,0 @@ -# /usr/bin/env python -# -*- coding:utf-8 -*- - -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -The script fetch and preprocess movie_reviews data set that provided by NLTK - -TODO(yuyang18): Complete dataset. -""" - -from __future__ import print_function - -import six -import collections -from itertools import chain - -import os -import nltk -from nltk.corpus import movie_reviews -import zipfile -from functools import cmp_to_key - -import paddle.dataset.common - -URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip" -MD5 = '155de2b77c6834dd8eea7cbe88e93acb' - -__all__ = ['train', 'test', 'get_word_dict'] -NUM_TRAINING_INSTANCES = 1600 -NUM_TOTAL_INSTANCES = 2000 - - -def download_data_if_not_yet(): - """ - Download the data set, if the data set is not download. - """ - try: - # download and extract movie_reviews.zip - paddle.dataset.common.download( - URL, 'corpora', md5sum=MD5, save_name='movie_reviews.zip') - path = os.path.join(paddle.dataset.common.DATA_HOME, 'corpora') - filename = os.path.join(path, 'movie_reviews.zip') - zip_file = zipfile.ZipFile(filename) - zip_file.extractall(path) - zip_file.close() - # make sure that nltk can find the data - if paddle.dataset.common.DATA_HOME not in nltk.data.path: - nltk.data.path.append(paddle.dataset.common.DATA_HOME) - movie_reviews.categories() - except LookupError: - print("Downloading movie_reviews data set, please wait.....") - nltk.download( - 'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME) - print("Download data set success.....") - print("Path is " + nltk.data.find('corpora/movie_reviews').path) - - -def get_word_dict(): - """ - Sorted the words by the frequency of words which occur in sample - :return: - words_freq_sorted - """ - words_freq_sorted = list() - word_freq_dict = collections.defaultdict(int) - download_data_if_not_yet() - - for category in movie_reviews.categories(): - for field in movie_reviews.fileids(category): - for words in movie_reviews.words(field): - word_freq_dict[words] += 1 - words_sort_list = list(six.iteritems(word_freq_dict)) - words_sort_list.sort(key=cmp_to_key(lambda a, b: b[1] - a[1])) - for index, word in enumerate(words_sort_list): - words_freq_sorted.append((word[0], index)) - return words_freq_sorted - - -def sort_files(): - """ - Sorted the sample for cross reading the sample - :return: - files_list - """ - files_list = list() - neg_file_list = movie_reviews.fileids('neg') - pos_file_list = movie_reviews.fileids('pos') - files_list = list( - chain.from_iterable(list(zip(neg_file_list, pos_file_list)))) - return files_list - - -def load_sentiment_data(): - """ - Load the data set - :return: - data_set - """ - data_set = list() - download_data_if_not_yet() - words_ids = dict(get_word_dict()) - for sample_file in sort_files(): - words_list = list() - category = 0 if 'neg' in sample_file else 1 - for word in movie_reviews.words(sample_file): - words_list.append(words_ids[word.lower()]) - data_set.append((words_list, category)) - return data_set - - -def reader_creator(data): - """ - Reader 
creator, generate an iterator for data set - :param data: - train data set or test data set - """ - for each in data: - yield each[0], each[1] - - -def train(): - """ - Default training set reader creator - """ - data_set = load_sentiment_data() - return reader_creator(data_set[0:NUM_TRAINING_INSTANCES]) - - -def test(): - """ - Default test set reader creator - """ - data_set = load_sentiment_data() - return reader_creator(data_set[NUM_TRAINING_INSTANCES:]) - - -def fetch(): - nltk.download('movie_reviews', download_dir=paddle.dataset.common.DATA_HOME) diff --git a/python/paddle/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py deleted file mode 100644 index 3540ea06b075ed9b649af803c5a655a1e737723b..0000000000000000000000000000000000000000 --- a/python/paddle/dataset/tests/test_sentiment.py +++ /dev/null @@ -1,58 +0,0 @@ -# /usr/bin/env python -# -*- coding:utf-8 -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import nltk -import paddle.dataset.sentiment as st -from nltk.corpus import movie_reviews - - -class TestSentimentMethods(unittest.TestCase): - def test_get_word_dict(self): - word_dict = st.get_word_dict()[0:10] - test_word_list = [(',', 0), ('the', 1), ('.', 2), ('a', 3), ('and', 4), - ('of', 5), ('to', 6), ("'", 7), ('is', 8), ('in', 9)] - for idx, each in enumerate(word_dict): - self.assertEqual(each, test_word_list[idx]) - self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path) - - def test_sort_files(self): - last_label = '' - for sample_file in st.sort_files(): - current_label = sample_file.split("/")[0] - self.assertNotEqual(current_label, last_label) - last_label = current_label - - def test_data_set(self): - data_set = st.load_sentiment_data() - last_label = -1 - - for each in st.test(): - self.assertNotEqual(each[1], last_label) - last_label = each[1] - - self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES) - self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES) - self.assertEqual( - len(list(st.test())), - (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES)) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index f3ee09a6d9ec1b22171253a920b26bbf98afd36e..e89cb1f5ec497133985ef8069e77e864cef47f52 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. 
# TODO: define distributed api under this directory, -from .base.role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker +from .base.role_maker import Role, UserDefinedRoleMaker, PaddleCloudRoleMaker from .base.distributed_strategy import DistributedStrategy from .base.fleet_base import Fleet from .base.util_factory import UtilBase @@ -26,6 +26,7 @@ __all__ = [ "UserDefinedRoleMaker", "PaddleCloudRoleMaker", "Fleet", + "Role", ] fleet = Fleet() @@ -39,8 +40,7 @@ server_num = fleet.server_num server_index = fleet.server_index server_endpoints = fleet.server_endpoints is_server = fleet.is_server -set_util = fleet.set_util -util = fleet.util +util = UtilBase() barrier_worker = fleet.barrier_worker init_worker = fleet.init_worker init_server = fleet.init_server diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index f1c836468daf36db753c67a3e09757be728d37a7..316b6494e347ff0352e7faf0b607425c8cdd3b50 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -845,6 +845,29 @@ class DistributedStrategy(object): check_configs_key(self.strategy.dgc_configs, configs, "dgc_configs") assign_configs_value(self.strategy.dgc_configs, configs) + @property + def fp16_allreduce(self): + """ + Indicating whether we are using fp16 gradient allreduce training + Default Value: False + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fp16_allreduce = True # by default this is false + + """ + return self.strategy.fp16_allreduce + + @fp16_allreduce.setter + @is_strict_auto + def fp16_allreduce(self, flag): + if not isinstance(flag, bool): + raise TypeError('fp16_allreduce must be value of bool type') + self.strategy.fp16_allreduce = flag + @property def gradient_merge(self): """ diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index d00faac838504f5d68e9d44d9ffa9f25c7bf2ee5..d0658efdca34ec5a10a0e2f07dabc357096bfad5 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -23,7 +23,6 @@ from .strategy_compiler import StrategyCompiler from .distributed_strategy import DistributedStrategy from .meta_optimizer_factory import MetaOptimizerFactory from .runtime_factory import RuntimeFactory -from .util_factory import UtilFactory from paddle.fluid.wrapped_decorator import wrap_decorator from paddle.fluid.dygraph import parallel_helper @@ -120,7 +119,6 @@ class Fleet(object): self.strategy_compiler = None self._is_collective = False self._runtime_handle = None - self._util = None def init(self, role_maker=None, is_collective=False): """ @@ -182,6 +180,9 @@ class Fleet(object): format(type(role_maker))) self._role_maker._generate_role() + import paddle.distributed.fleet as fleet + fleet.util._set_role_maker(self._role_maker) + self.strategy_compiler = StrategyCompiler() if paddle.fluid.framework.in_dygraph_mode(): if parallel_helper._is_parallel_ctx_initialized(): @@ -353,29 +354,6 @@ class Fleet(object): return self._role_maker._is_server( ) or self._role_maker._is_heter_worker() - def set_util(self, util): - self._util = util - - def util(self): - """ - Utility functions that can be used under certain runtime - return util - - Returns: - UtilBase: instance of UtilBase, can use distributed ops/tools easily. - - Examples: - - .. 
code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - util = fleet.util - files = ["1.log", "2.log", "3.log", "4.log"] - files = util.get_file_shard() - - """ - return self._util - def barrier_worker(self): """ barrier all workers @@ -1102,7 +1080,7 @@ class Fleet(object): if self._runtime_handle is None: self._runtime_handle = RuntimeFactory()._create_runtime(context) - if self._util is None: - self._util = UtilFactory()._create_util(context) + import paddle.distributed.fleet as fleet + fleet.util._set_strategy(context["valid_strategy"]) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index aab0a6f3cd3f3e31dcbedd30711ba8f62824021b..bfb5ace508c91f2c4dd1ced18ba5372e0e37e1ea 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -598,7 +598,8 @@ class PaddleCloudRoleMaker(RoleMakerBase): """ if not self._role_is_generated: self._generate_role() - return len(self._get_pserver_endpoints()) + return len(self._get_pserver_endpoints( + )) if self._get_pserver_endpoints() is not None else 0 def _node_num(self): """ diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py index efaa854c0879ddb57c7746cede68047ff82931a0..bbb7d60ed9c79f4caae5a0eb7427679bc5c7848e 100644 --- a/python/paddle/distributed/fleet/base/util_factory.py +++ b/python/paddle/distributed/fleet/base/util_factory.py @@ -73,11 +73,13 @@ class UtilBase(object): .. code-block:: python # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` . - from paddle.distributed.fleet.base.util_factory import fleet_util import paddle.distributed.fleet as fleet from paddle.distributed.fleet import PaddleCloudRoleMaker import sys import numpy as np + import os + + os.environ["PADDLE_WITH_GLOO"] = "2" def train(): role = PaddleCloudRoleMaker( @@ -85,19 +87,18 @@ class UtilBase(object): init_gloo=True, path="./tmp_gloo") fleet.init(role) - fleet_util._set_role_maker(role) if fleet.is_server(): input = [1, 2] - output = fleet_util.all_reduce(input, "sum", "server") + output = fleet.util.all_reduce(input, "sum", "server") print(output) # [2, 4] elif fleet.is_worker(): input = np.array([3, 4]) - output = fleet_util.all_reduce(input, "sum", "worker") + output = fleet.util.all_reduce(input, "sum", "worker") print(output) # [6, 8] - output = fleet_util.all_reduce(input, "sum", "all") + output = fleet.util.all_reduce(input, "sum", "all") print(output) # [8, 12] if __name__ == "__main__": @@ -117,10 +118,12 @@ class UtilBase(object): .. code-block:: python # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` . 
- from paddle.distributed.fleet.base.util_factory import fleet_util import paddle.distributed.fleet as fleet from paddle.distributed.fleet import PaddleCloudRoleMaker import sys + import os + + os.environ["PADDLE_WITH_GLOO"] = "2" def train(): role = PaddleCloudRoleMaker( @@ -128,15 +131,14 @@ class UtilBase(object): init_gloo=True, path="./tmp_gloo") fleet.init(role) - fleet_util._set_role_maker(role) if fleet.is_server(): - fleet_util.barrier("server") + fleet.util.barrier("server") print("all server arrive here") elif fleet.is_worker(): - fleet_util.barrier("worker") + fleet.util.barrier("worker") print("all server arrive here") - fleet_util.barrier("all") + fleet.util.barrier("all") print("all servers and workers arrive here") if __name__ == "__main__": @@ -160,10 +162,12 @@ class UtilBase(object): .. code-block:: python # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` . - from paddle.distributed.fleet.base.util_factory import fleet_util import paddle.distributed.fleet as fleet from paddle.distributed.fleet import PaddleCloudRoleMaker import sys + import os + + os.environ["PADDLE_WITH_GLOO"] = "2" def train(): role = PaddleCloudRoleMaker( @@ -171,19 +175,18 @@ class UtilBase(object): init_gloo=True, path="./tmp_gloo") fleet.init(role) - fleet_util._set_role_maker(role) if fleet.is_server(): input = fleet.server_index() - output = fleet_util.all_gather(input, "server") + output = fleet.util.all_gather(input, "server") print(output) # output = [0, 1] elif fleet.is_worker(): input = fleet.worker_index() - output = fleet_util.all_gather(input, "worker") + output = fleet.util.all_gather(input, "worker") # output = [0, 1] print(output) - output = fleet_util.all_gather(input, "all") + output = fleet.util.all_gather(input, "all") print(output) # output = [0, 1, 0, 1] @@ -220,18 +223,20 @@ class UtilBase(object): .. code-block:: python - from paddle.distributed.fleet.base.util_factory import fleet_util - import paddle.distributed.fleet.base.role_maker as role_maker + import paddle.distributed.fleet as fleet + from paddle.distributed.fleet import UserDefinedRoleMaker - role = role_maker.UserDefinedRoleMaker( + role = UserDefinedRoleMaker( is_collective=False, init_gloo=False, current_id=0, - role=role_maker.Role.WORKER, + role=fleet.Role.WORKER, worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"], server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]) - fleet_util._set_role_maker(role) - files = fleet_util.get_file_shard(["file1", "file2", "file3"]) + fleet.init(role) + + files = fleet.util.get_file_shard(["file1", "file2", "file3"]) + print(files) # files = ["file1", "file2"] """ if not isinstance(files, list): @@ -267,18 +272,19 @@ class UtilBase(object): .. 
code-block:: python - from paddle.distributed.fleet.base.util_factory import fleet_util - import paddle.distributed.fleet.base.role_maker as role_maker + import paddle.distributed.fleet as fleet + from paddle.distributed.fleet import UserDefinedRoleMaker - role = role_maker.UserDefinedRoleMaker( + role = UserDefinedRoleMaker( is_collective=False, init_gloo=False, current_id=0, - role=role_maker.Role.WORKER, + role=fleet.Role.WORKER, worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"], server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]) - fleet_util._set_role_maker(role) - fleet_util.print_on_rank("I'm worker 0", 0) + fleet.init(role) + + fleet.util.print_on_rank("I'm worker 0", 0) """ if self.role_maker._worker_index() != rank_id: return @@ -577,6 +583,3 @@ class UtilBase(object): print("fetch_targets name: %s" % v.name) print("fetch_targets: {}".format(results[i])) return results - - -fleet_util = UtilFactory()._create_util(None) diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index a3a2dee70387d69b9e8e09cd86d69a76890d7a1f..2e63e82e630cc58cae4405986f1ebf770a5dc9f3 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -23,3 +23,4 @@ from .lars_optimizer import LarsOptimizer from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer from .dgc_optimizer import DGCOptimizer from .lamb_optimizer import LambOptimizer +from .fp16_allreduce_optimizer import FP16AllReduceOptimizer diff --git a/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py new file mode 100755 index 0000000000000000000000000000000000000000..411980ed01322ac56813efcd0684bb12e9c8761b --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py @@ -0,0 +1,146 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and + +from paddle.fluid import core, framework, unique_name +from .meta_optimizer_base import MetaOptimizerBase + + +class FP16AllReduceOptimizer(MetaOptimizerBase): + def __init__(self, optimizer): + super(FP16AllReduceOptimizer, self).__init__(optimizer) + self.inner_opt = optimizer + # we do not allow meta optimizer to be inner optimizer currently + self.meta_optimizers_white_list = [ + "LarsOptimizer", + "LambOptimizer", + "RecomputeOptimizer", + "LocalSGDOptimizer", + "GradientMergeOptimizer", + "GraphExecutionOptimizer", + "AdaptiveLocalSGDOptimizer", + ] + self.meta_optimizers_black_list = ["DGCOptimizer"] + + def _set_basic_info(self, loss, role_maker, user_defined_optimizer, + user_defined_strategy): + super(FP16AllReduceOptimizer, self)._set_basic_info( + loss, role_maker, user_defined_optimizer, user_defined_strategy) + + def _can_apply(self): + if not self.role_maker._is_collective: + return False + + if self.user_defined_strategy.fp16_allreduce: + return True + + return False + + def _disable_strategy(self, dist_strategy): + dist_strategy.fp16_allreduce = False + + def _enable_strategy(self, dist_strategy, context=None): + dist_strategy.fp16_allreduce = True + + @staticmethod + def fp16_compression(param_and_grads): + """ + Compress fp32 gradients to fp16 during allreduce. + """ + op_maker = core.op_proto_and_checker_maker + + new_param_and_grads = [] # param, grad, is_cast + # cast grad from fp32->fp16 before allreduce, + for param, grad in param_and_grads: + if grad is None or grad.dtype != core.VarDesc.VarType.FP32: + new_param_and_grads.append((param, grad, False)) + continue + + op = grad.op + block = grad.block + var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()] + if param.name not in var_attr: + new_param_and_grads.append((param, grad, False)) + continue + + # remove (param, grad) from op_role_var + var_attr.remove(param.name) + var_attr.remove(grad.name) + if len(var_attr) > 1: + op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr) + else: + op._remove_attr(op_maker.kOpRoleVarAttrName()) + + new_grad = block.create_var( + name=unique_name.generate(grad.name + ".cast_fp16"), + dtype=core.VarDesc.VarType.FP16, + persistable=False, + stop_gradient=True) + + with block.program._backward_role_guard(): + cast_op = block.append_op( + type="cast", + inputs={"X": grad}, + outputs={"Out": new_grad}, + attrs={ + "in_dtype": core.VarDesc.VarType.FP32, + "out_dtype": core.VarDesc.VarType.FP16 + }, + stop_gradient=True) + + backward = op_maker.OpRole.Backward + cast_op._set_attr(op_maker.kOpRoleAttrName(), backward) + cast_op._set_attr(op_maker.kOpRoleVarAttrName(), + [param.name, new_grad.name]) + new_grad.op = cast_op + + new_param_and_grads.append((param, new_grad, True)) + + ret_param_and_grads = [] + # cast grad from fp16->fp32 after allreduce. + # NOTE. Now we split fp16 compression into two for loops; + # if we do not separate them, the fused allreduce will be wrong. + # This seems to be a problem of the fuse allreduce pass and needs + # to be fixed in the future.
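+        # The second loop (below) creates an fp32 copy of each gradient that was cast to fp16 above and appends the fp16->fp32 cast op under the parameter's _optimized_guard within name_scope('fp16_allreduce'), so the inner optimizer still consumes fp32 gradients.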
+ for param, grad, cast in new_param_and_grads: + if not cast: + ret_param_and_grads.append((param, grad)) + continue + + block = grad.block + new_grad = block.create_var( + name=unique_name.generate(grad.name + ".cast_fp32"), + dtype=core.VarDesc.VarType.FP32, + persistable=False, + stop_gradient=True) + + with block.program._optimized_guard( + [param, grad]), framework.name_scope('fp16_allreduce'): + cast_op = block.append_op( + type="cast", + inputs={"X": grad}, + outputs={"Out": new_grad}, + attrs={ + "in_dtype": core.VarDesc.VarType.FP16, + "out_dtype": core.VarDesc.VarType.FP32 + }, + stop_gradient=True) + ret_param_and_grads.append((param, new_grad)) + + return ret_param_and_grads + + def apply_optimize(self, loss, startup_program, params_grads): + new_params_grads = self.fp16_compression(params_grads) + return self.inner_opt.apply_optimize( + loss, + startup_program=startup_program, + params_grads=new_params_grads) diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 5127e45ee17f786d9a7fceacc02288deecdd4662..e4f7d34c3cdd5607ac93ae1234cd4757211703f5 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -225,12 +225,12 @@ class ParameterServerRuntime(RuntimeBase): else: model_dirname = None - if self.role_maker._is_heter_worker(): - self._init_worker() - executor = self._get_executor() executor.run(fluid.default_startup_program()) + if self.role_maker._is_heter_worker(): + self._init_worker() + if self.role_maker._is_heter_worker(): return diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py index abf198b97e6e818e1fbe59006f98492640bcee54..a45e1682c3fad46ce62274ae14577cb7fd6b89b8 100644 --- a/python/paddle/distributed/fleet/utils/__init__.py +++ b/python/paddle/distributed/fleet/utils/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .fs import LocalFS, HDFSClient diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index b7c50bda3eae0a004bd3d0169fc62260393b28b8..8d4f24fb116cccacbc88aa73ffbaeb559e1fdc06 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -120,7 +120,7 @@ class LocalFS(FS): Examples: .. code-block:: python - from paddle.distributed.fleet.utils.fs import LocalFS + from paddle.distributed.fleet.utils import LocalFS client = LocalFS() subdirs, files = client.ls_dir("./") @@ -140,7 +140,7 @@ class LocalFS(FS): Examples: .. code-block:: python - from paddle.distributed.fleet.utils.fs import LocalFS + from paddle.distributed.fleet.utils import LocalFS client = LocalFS() subdirs, files = client.ls_dir("./") @@ -160,7 +160,7 @@ class LocalFS(FS): def mkdirs(self, fs_path): """ - Create a remote HDFS directory. + Create a local directory. Args: fs_path(str): The local directory path. @@ -168,7 +168,7 @@ class LocalFS(FS): Examples: .. code-block:: python - from paddle.distributed.fleet.utils.fs import LocalFS + from paddle.distributed.fleet.utils import LocalFS client = LocalFS() client.mkdirs("test_mkdirs") @@ -189,7 +189,7 @@ class LocalFS(FS): Examples: .. 
code-block:: python - from paddle.distributed.fleet.utils.fs import LocalFS + from paddle.distributed.fleet.utils import LocalFS client = LocalFS() client.touch("test_rename_src") @@ -217,7 +217,7 @@ class LocalFS(FS): Examples: .. code-block:: python - from paddle.distributed.fleet.utils.fs import LocalFS + from paddle.distributed.fleet.utils import LocalFS client = LocalFS() client.mkdirs("test_localFS_mkdirs") @@ -247,7 +247,7 @@ class LocalFS(FS): Examples: .. code-block:: python - from paddle.distributed.fleet.utils.fs import LocalFS + from paddle.distributed.fleet.utils import LocalFS client = LocalFS() client.touch("test_is_file") @@ -269,7 +269,7 @@ class LocalFS(FS): Examples: .. code-block:: python - from paddle.distributed.fleet.utils.fs import LocalFS + from paddle.distributed.fleet.utils import LocalFS client = LocalFS() client.mkdirs("test_is_dir") @@ -292,7 +292,7 @@ class LocalFS(FS): Examples: .. code-block:: python - from paddle.distributed.fleet.utils.fs import LocalFS + from paddle.distributed.fleet.utils import LocalFS client = LocalFS() ret = local_fs.is_exist("test_is_exist") @@ -311,7 +311,7 @@ class LocalFS(FS): Examples: .. code-block:: python - from paddle.distributed.fleet.utils.fs import LocalFS + from paddle.distributed.fleet.utils import LocalFS client = LocalFS() client.touch("test_touch") @@ -332,13 +332,11 @@ class LocalFS(FS): src_path(str): Name of the file or directory, that's needed to be moved. dst_path(str): Name of the file or directory to which to move to. overwrite(bool): Whether to re-write `dst_path` if that exists. Default is False. - test_exists(bool): Check the existence of `src_path` and `dst_path` . - When `test_exists` is set true, if `src_path` doesn't exist or `dst_path` exists, program will throw an Excetption. Examples: .. code-block:: python - from paddle.distributed.fleet.utils.fs import LocalFS + from paddle.distributed.fleet.utils import LocalFS client = LocalFS() client.touch("test_mv_src") @@ -369,7 +367,7 @@ class LocalFS(FS): Examples: .. code-block:: python - from paddle.distributed.fleet.utils.fs import LocalFS + from paddle.distributed.fleet.utils import LocalFS client = LocalFS() subdirs = client.list_dirs("./") @@ -432,7 +430,7 @@ class HDFSClient(FS): .. code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { @@ -493,7 +491,7 @@ class HDFSClient(FS): .. code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { @@ -526,7 +524,7 @@ class HDFSClient(FS): .. code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { @@ -587,7 +585,7 @@ class HDFSClient(FS): .. code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { @@ -629,7 +627,7 @@ class HDFSClient(FS): .. code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { @@ -661,7 +659,7 @@ class HDFSClient(FS): .. 
code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { @@ -695,7 +693,7 @@ class HDFSClient(FS): .. code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { @@ -740,7 +738,7 @@ class HDFSClient(FS): .. code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { @@ -784,7 +782,7 @@ class HDFSClient(FS): .. code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { @@ -830,7 +828,7 @@ class HDFSClient(FS): .. code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { @@ -893,7 +891,7 @@ class HDFSClient(FS): .. code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { @@ -919,12 +917,14 @@ class HDFSClient(FS): Args: fs_path(str): The HDFS file path. + exist_ok(bool): When `fs_path` exists, if `exist_ok` is set false, + program will throw an Exception. Default is true. Examples: .. code-block:: text - from paddle.distributed.fleet.utils.fs import HDFSClient + from paddle.distributed.fleet.utils import HDFSClient hadoop_home = "/home/client/hadoop-client/hadoop/" configs = { diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 7b301ac19d1d3dc1f4aabb6cf3af2f0874faa677..04e4906868e856b84bbc13ee60ca6a0b231cfad3 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -590,7 +590,7 @@ class GradientClipByGlobalNorm(GradientClipBase): global_norm_var = layers.reduce_sum(global_norm_var) global_norm_var = layers.sqrt(global_norm_var) max_global_norm = layers.fill_constant( - shape=[1], dtype='float32', value=self.clip_norm) + shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) clip_var = layers.elementwise_div( x=max_global_norm, y=layers.elementwise_max( @@ -635,7 +635,9 @@ class GradientClipByGlobalNorm(GradientClipBase): global_norm_var = layers.sums(sum_square_list) global_norm_var = layers.sqrt(x=global_norm_var) max_global_norm = layers.fill_constant( - shape=[1], dtype="float32", value=self.clip_norm) + shape=[1], + dtype=global_norm_var.dtype, + value=self.clip_norm) scale_var = layers.elementwise_div( x=max_global_norm, y=layers.elementwise_max( @@ -663,7 +665,7 @@ class GradientClipByGlobalNorm(GradientClipBase): context[self.group_name] = [] context[self.group_name + "_clip_value"] = self.clip_norm context[self.group_name + "_clip"] = layers.fill_constant( - shape=[1], dtype="float32", value=self.clip_norm) + shape=[1], dtype=grad.dtype, value=self.clip_norm) else: if not self.clip_norm == context[self.group_name + "_clip_value"]: raise ValueError( diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 7b564b3f837c001673bdd272ba60edf31cde21fb..ac6493b1c2969a8c3319bc8d29983b0ccc3a67d9 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ 
b/python/paddle/fluid/contrib/layers/nn.py @@ -45,6 +45,7 @@ from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from paddle.fluid import core +from paddle.fluid.param_attr import ParamAttr from paddle.fluid.entry_attr import ProbabilityEntry, CountFilterEntry from paddle.fluid.framework import Variable, convert_np_dtype_to_dtype_ @@ -57,7 +58,7 @@ __all__ = [ 'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat', 'sparse_embedding', 'partial_sum', 'tdm_child', 'rank_attention', 'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice', - 'correlation' + 'correlation', 'fused_bn_add_act' ] @@ -1625,3 +1626,191 @@ def correlation(x, }, outputs={"Output": output}) return output + + +def fused_bn_add_act(x, + y, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + moving_mean_name=None, + moving_variance_name=None, + act=None, + name=None): + """ + This Op performs batch norm on input x, and adds the result to input y. Then + it performs activation on the sum. The data format of inputs must be NHWC + `[batch, in_height, in_width, in_channels]`. + + Args: + x(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type + is float16. + y(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type + is float16. + momentum(float|Tensor, optional): The value used for the moving_mean and + moving_var computation. This should be a float number or a tensor with + shape [1] and data type as float32. The updated formula is: + :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)` + :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)` + Default is 0.9. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. + If the Initializer of the param_attr is not set, the parameter is initialized + with Xavier. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + Default: None. + moving_mean_name(str, optional): The name of moving_mean which store the global Mean. If it + is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm + will save global mean with the string. + moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. + If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm + will save global variance with the string. + act(string, optional): Activation type, linear|relu|prelu|... + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + + def build_program(main_program, startup_program): + with fluid.program_guard(main_program, startup_program): + x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') + y = fluid.layers.data(name="y", shape=[1], dtype='int64') + conv1_1 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + bias_attr=False, + data_format='NHWC') + conv1_2 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + bias_attr=False, + data_format='NHWC') + bn = fluid.layers.batch_norm( + input=conv1_1, + act=None, + data_layout='NHWC') + fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(conv1_2, bn) + prediction = fluid.layers.fc(input=fused_bn_add_act, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=y) + loss = fluid.layers.mean(loss) + sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = fluid.contrib.mixed_precision.decorate( + sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) + sgd.minimize(loss) + + return x, y, loss + + iters = 5 + batch_size = 16 + support_gpu = fluid.is_compiled_with_cuda() + if support_gpu: + main_program = fluid.Program() + startup_program = fluid.Program() + place = fluid.CUDAPlace(0) + x, y, loss = build_program(main_program, startup_program) + + feeder = fluid.DataFeeder(feed_list=[x, y], place=place) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + exe = fluid.Executor(place) + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe.run(startup_program) + for _ in range(iters): + data = next(train_reader()) + loss_v = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss]) + """ + helper = LayerHelper('fused_bn_add_act', **locals()) + + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'fused_bn_add_act') + check_variable_and_dtype(y, 'input', ['float16', 'float32', 'float64'], + 'fused_bn_add_act') + bn_param_dtype = core.VarDesc.VarType.FP32 + + x_shape = x.shape + channel_num = x_shape[-1] + param_shape = [channel_num] + + # create parameter + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=bn_param_dtype, + default_initializer=Constant(1.0)) + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=param_shape, + dtype=bn_param_dtype, + is_bias=True) + mean = helper.create_parameter( + attr=ParamAttr( + name=moving_mean_name, initializer=Constant(0.0), trainable=False), + shape=param_shape, + dtype=bn_param_dtype) + mean.stop_gradient = True + variance = helper.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False), + shape=param_shape, + dtype=bn_param_dtype) + variance.stop_gradient = True + + # create output + # mean and mean_out share the same memory + mean_out = mean + # variance and variance out share the same memory + variance_out = variance + saved_mean = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + reserve_space = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.FP16, stop_gradient=True) + batch_norm_out = helper.create_variable_for_type_inference( + core.VarDesc.VarType.FP16) + + inputs = { + "X": x, + "Z": y, + "Scale": scale, + "Bias": bias, + } + attrs = {"epsilon": epsilon, 'momentum': 
momentum} + + outputs = { + "Y": batch_norm_out, + "MeanOut": mean_out, + "VarianceOut": variance_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance, + "ReserveSpace": reserve_space + } + + helper.append_op( + type="fused_bn_add_activation", + inputs=inputs, + outputs=outputs, + attrs=attrs) + + return batch_norm_out diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 1f301b7148d005d4e3d5d272fd78f78af6dc1e6a..a9f080c514dff078b0068bce262fa177fd0b0db2 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -135,6 +135,7 @@ gray_list = { 'get_tensor_from_selected_rows', 'sign', 'cast', + 'fused_bn_add_activation', } ''' # The set of ops that don't support fp16 calculation diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 0b142ff33de55f36410eb9c23cb75210fc9d6321..0ff166d8dc89ac79c36343df9bc379cb171c36fd 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -69,8 +69,10 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): ] for in_name in op.input_names: - if src_dtype == core.VarDesc.VarType.FP32 and op.type == 'batch_norm': - if in_name != 'X': + if src_dtype == core.VarDesc.VarType.FP32 and op.type in [ + 'batch_norm', 'fused_bn_add_activation' + ]: + if in_name not in {'X', 'Z'}: continue for in_var_name in op.input(in_name): in_var = block.var(in_var_name) @@ -102,7 +104,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): op._set_attr('in_dtype', dest_dtype) if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16: for out_name in op.output_names: - if op.type == 'batch_norm' and out_name != 'Y': + if op.type in ['batch_norm', 'fused_bn_add_activation' + ] and out_name != 'Y': continue for out_var_name in op.output(out_name): out_var = block.var(out_var_name) diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 6ac005060e0b21d88f17619bbe88b7a56c23fdb8..dd4bea06572fb5fdec70047fc0df58d8287ad390 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -270,7 +270,7 @@ foreach(src ${TEST_OPS}) endforeach() # setting timeout value for old unittests -if(NOT WIN32 AND NOT APPLE) +if(NOT WIN32) set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY") endif() diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py index 906d83fff4fd61390a68133170cb1c43f6b74251..6000a44ceb65972ad5c31f328453bf346a3d3916 100644 --- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py +++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py @@ -14,6 +14,7 @@ from __future__ import print_function +import six import unittest from functools import partial import numpy as np @@ -24,6 +25,24 @@ import contextlib paddle.enable_static() +def fake_imdb_reader(word_dict_size, + sample_num, + lower_seq_len=100, + upper_seq_len=200, + class_dim=2): + def __reader__(): + for _ in six.moves.range(sample_num): + length 
= np.random.random_integers( + low=lower_seq_len, high=upper_seq_len, size=[1])[0] + ids = np.random.random_integers( + low=0, high=word_dict_size - 1, size=[length]).astype('int64') + label = np.random.random_integers( + low=0, high=class_dim - 1, size=[1]).astype('int64')[0] + yield ids, label + + return __reader__ + + def get_places(): places = [fluid.CPUPlace()] if fluid.core.is_compiled_with_cuda(): @@ -68,10 +87,11 @@ def bow_net(data, class TestWeightDecay(unittest.TestCase): def setUp(self): - self.word_dict = paddle.dataset.imdb.word_dict() - reader = paddle.batch( - paddle.dataset.imdb.train(self.word_dict), batch_size=2)() - self.train_data = [next(reader) for _ in range(5)] + self.word_dict_len = 5147 + batch_size = 2 + reader = fake_imdb_reader(self.word_dict_len, batch_size * 100) + reader = paddle.batch(reader, batch_size=batch_size)() + self.train_data = [next(reader) for _ in range(3)] self.learning_rate = .5 def run_program(self, place, feed_list): @@ -103,7 +123,7 @@ class TestWeightDecay(unittest.TestCase): data = fluid.layers.data( name="words", shape=[1], dtype="int64", lod_level=1) label = fluid.layers.data(name="label", shape=[1], dtype="int64") - avg_cost = model(data, label, len(self.word_dict)) + avg_cost = model(data, label, self.word_dict_len) AdamW = fluid.contrib.extend_with_decoupled_weight_decay( fluid.optimizer.Adam) @@ -127,7 +147,7 @@ class TestWeightDecay(unittest.TestCase): name="words", shape=[1], dtype="int64", lod_level=1) label = fluid.layers.data(name="label", shape=[1], dtype="int64") - avg_cost = model(data, label, len(self.word_dict)) + avg_cost = model(data, label, self.word_dict_len) param_list = [(var, var * self.learning_rate) for var in main_prog.block(0).all_parameters()] diff --git a/python/paddle/fluid/data.py b/python/paddle/fluid/data.py index dc57e9f71ed3d0de1a374bdf719b32a083198b31..05ea66f54451ba08032bff4e7bc805bbffa15e73 100644 --- a/python/paddle/fluid/data.py +++ b/python/paddle/fluid/data.py @@ -19,10 +19,12 @@ from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.data_feeder import check_dtype, check_type from ..utils import deprecated +from paddle.fluid.framework import static_only __all__ = ['data'] +@static_only @deprecated(since="2.0.0", update_to="paddle.static.data") def data(name, shape, dtype='float32', lod_level=0): """ diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 93cb0bafc847b897816636f92255bd06b7e67321..f4ea4d670e6006c86cd9ca8a15723e5a62e5c0d9 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -145,7 +145,7 @@ def load_dygraph(model_path, config=None): .. note:: Due to some historical reasons, if you load ``state_dict`` from the saved - result of `paddle.io.save_inference_model`, the structured variable name + result of `paddle.static.save_inference_model`, the structured variable name will cannot be restored. You need to set the argument `use_structured_name=False` when using `Layer.set_state_dict` later. @@ -164,24 +164,24 @@ def load_dygraph(model_path, config=None): .. 
code-block:: python import paddle - + import paddle.fluid as fluid + paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() - paddle.save(state_dict, "paddle_dy") + fluid.save_dygraph(state_dict, "paddle_dy") - scheduler = paddle.optimizer.lr_scheduler.NoamLR( + scheduler = paddle.optimizer.lr_scheduler.NoamLR( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, parameters=emb.parameters()) state_dict = adam.state_dict() - paddle.save(state_dict, "paddle_dy") - - para_state_dict, opti_state_dict = paddle.load("paddle_dy") + fluid.save_dygraph(state_dict, "paddle_dy") + para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy") ''' # deal with argument `model_path` model_prefix = model_path @@ -275,7 +275,7 @@ def load_dygraph(model_path, config=None): # If users save all parameters as one file, the [ variable.name -> variable ] # mapping info will lost, so users need to give variable list, but users build # variable list in dygraph mode is difficult, we recommend users to use - # paddle.io.load_program_state in this case + # paddle.static.load_program_state in this case # Try to load all the files in the directory in VarBase format, # the file name is used as the name of VarBase diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index c837c8eb123c2707d89a75a7489607f43a2e7501..908587c0d9c3112b14d6fe0efadcb673ecd49827 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -29,7 +29,7 @@ import six from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len from paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger -from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticLayer +from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction from paddle.fluid.dygraph.dygraph_to_static.program_translator import convert_to_static from paddle.fluid.dygraph.dygraph_to_static.program_translator import unwrap_decorators from paddle.fluid.dygraph.layers import Layer @@ -143,14 +143,14 @@ def convert_call(func): # def foo(x): # return x # - # `foo` will be converted into a wrapper class, suppose as `StaticLayer`. - # And `foo.__globals__['foo']` will still return this `StaticLayer` instead of - # `foo` function. So `isinstance(fn, StaticLayer)` is added here. + # `foo` will be converted into a wrapper class, suppose as `StaticFunction`. + # And `foo.__globals__['foo']` will still return this `StaticFunction` instead of + # `foo` function. So `isinstance(fn, StaticFunction)` is added here. 
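 # [Editor's sketch, not part of this patch] A self-contained illustration of the
 # situation the comment above describes, using a hypothetical stand-in class
 # rather than Paddle's real StaticFunction: once a module-level function is
 # decorated, the global name refers to the wrapper object, so code that walks
 # __globals__ has to unwrap it before treating it as a plain function.
 #
 #     class FakeStaticFunction(object):          # stand-in, not Paddle's class
 #         def __init__(self, function):
 #             self.dygraph_function = function
 #
 #         def __call__(self, *args, **kwargs):
 #             return self.dygraph_function(*args, **kwargs)
 #
 #     @FakeStaticFunction
 #     def foo(x):
 #         return x
 #
 #     def bar(x):
 #         return foo(x)
 #
 #     # bar.__globals__['foo'] is now the wrapper, not the raw function, which
 #     # is why convert_call checks isinstance(fn, StaticFunction) while scanning
 #     # func.__globals__.values() and unwraps it via unwrap_decorators.
 #     print(type(bar.__globals__['foo']))              # FakeStaticFunction
 #     print(bar.__globals__['foo'].dygraph_function)   # the original foo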
global_functions = set() for fn in func.__globals__.values(): if inspect.isfunction(fn): global_functions.add(fn) - elif isinstance(fn, StaticLayer): + elif isinstance(fn, StaticFunction): _, fn = unwrap_decorators(fn) global_functions.add(fn) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 3b3b9bbe96f2929257d99b924af9770605b287f4..ddf44d805d1bfe126573ea76caae9d6bfe542bad 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -205,7 +205,7 @@ def unwrap_decorators(func): decorators = [] cur = func while True: - if isinstance(cur, StaticLayer): + if isinstance(cur, StaticFunction): decorators.append(cur) # Note: if `cur` is a method, keep it as bound method of class. instance = cur._class_instance @@ -218,7 +218,7 @@ def unwrap_decorators(func): return decorators, cur -class StaticLayer(object): +class StaticFunction(object): """ Wrapper class to Manage program conversion of decorated function. @@ -226,7 +226,7 @@ class StaticLayer(object): def __init__(self, function, input_spec=None): """ - Initializes a `StaticLayer`. + Initializes a `StaticFunction`. Args: function(callable): A function or method that will be converted into static program. @@ -268,12 +268,12 @@ class StaticLayer(object): In above case, `net(x, y)` will call `net.forward(x, y)` firstly that is a bound method of `Net` instance. After decorated by `@paddle.jit.to_static`, it will firstly to call `__get__` - to parse the class instance correctly instead of the `StaticLayer` instance. + to parse the class instance correctly instead of the `StaticFunction` instance. """ if instance not in self._descriptor_cache: if instance is None: return self - # Note(Aurelius84): To construct new instance of StaticLayer when we + # Note(Aurelius84): To construct new instance of StaticFunction when we # first encouter the bound function of layer and cache it. new_static_layer = self._clone() new_static_layer._class_instance = instance diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 10819e4b320dd0630c7ac43fdf89b84252823a94..194ebafb08eefde4b08e503786f06c6dd06caa3b 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -28,7 +28,7 @@ from paddle.fluid.data_feeder import check_type from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity -from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticLayer, unwrap_decorators +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticFunction, unwrap_decorators from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer from paddle.fluid.dygraph.layers import Layer from paddle.fluid.executor import Executor, scope_guard @@ -141,7 +141,7 @@ def copy_decorator_attrs(original_func, decorated_obj): Args: original_func(callable): the original decorated function. - decorated_obj(StaticLayer): the target decorated StaticLayer object. + decorated_obj(StaticFunction): the target decorated StaticFunction object. 
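        Editor's note (illustrative only, not part of this patch): a rough usage
        sketch of the decoration flow being renamed here. It assumes the
        ``paddle.jit.to_static`` alias of ``declarative`` referenced elsewhere in
        this change; decorating a plain function and calling it with dygraph
        tensors yields the wrapper object (``StaticFunction`` after this rename).

        .. code-block:: python

            import paddle
            from paddle.jit import to_static

            paddle.disable_static()

            @to_static
            def add(x, y):
                return x + y

            out = add(paddle.ones([2, 2]), paddle.ones([2, 2]))
            print(type(add))      # the wrapper class (StaticFunction after this change)
            print(out.numpy())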
""" decorator_name = "declarative" @@ -198,7 +198,7 @@ def declarative(function=None, input_spec=None): def decorated(python_func): """ - Decorates a python function into a StaticLayer object. + Decorates a python function into a StaticFunction object. """ # Step 1. unwrap the function if it is already decorated. _, python_func = unwrap_decorators(python_func) @@ -206,7 +206,7 @@ def declarative(function=None, input_spec=None): # Step 2. copy some attributes from original python function. static_layer = copy_decorator_attrs( original_func=python_func, - decorated_obj=StaticLayer( + decorated_obj=StaticFunction( function=python_func, input_spec=input_spec)) return static_layer @@ -214,7 +214,7 @@ def declarative(function=None, input_spec=None): # for usage: `declarative(foo, ...)` if function is not None: if isinstance(function, Layer): - if isinstance(function.forward, StaticLayer): + if isinstance(function.forward, StaticFunction): class_name = function.__class__.__name__ logging_utils.warn( "`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one.". @@ -231,9 +231,7 @@ def declarative(function=None, input_spec=None): class SaveLoadConfig(object): """ The additional configuration options may be used in function - :ref:`api_imperative_jit_save` that save :ref:`api_imperative_TranslatedLayer` - or used in function :ref:`api_imperative_jit_load` that - load :ref:`api_imperative_TranslatedLayer` . + ``paddle.jit.save/load`` and ``paddle.load`` . Examples: 1. Using ``SaveLoadConfig`` when saving model @@ -319,7 +317,7 @@ class SaveLoadConfig(object): @property def output_spec(self): """ - Selects the output targets of the saved model ( :ref:`api_imperative_TranslatedLayer` ). + Selects the output targets of the saved model ( ``paddle.jit.TranslatedLayer`` ). By default, all return variables of original Layer's forward function are kept as the output of the saved TranslatedLayer. @@ -531,11 +529,14 @@ class SaveLoadConfig(object): def separate_params(self): """ Configure whether to save the Layer parameters as separete files. - (In order to be compatible with the behavior of :ref:`api_fluid_io_save_inference_model` ) + (In order to be compatible with the behavior of ``paddle.static.save_inference_model`` ) If True, each parameter will be saved to a file separately, the file name is the parameter name, and the SaveLoadConfig.params_filename configuration will not take effect. Default False. + .. note:: + Only used for ``paddle.jit.save`` . + Examples: .. code-block:: python @@ -569,7 +570,7 @@ class SaveLoadConfig(object): adam.clear_grad() model_path = "simplenet.example.model.separate_params" - config = paddle.jit.SaveLoadConfig() + config = paddle.SaveLoadConfig() config.separate_params = True # saving with configs.separate_params @@ -599,12 +600,12 @@ class SaveLoadConfig(object): def keep_name_table(self): """ Configures whether keep ``structured_name -> parameter_name`` dict in loaded state dict. - This dict is the debugging information saved when call `paddle.save`. + This dict is the debugging information saved when call ``paddle.save`` . It is generally only used for debugging and does not affect the actual training or inference. - By default, it will not be retained in `paddle.load` result. Default: False. + By default, it will not be retained in ``paddle.load`` result. Default: False. .. note:: - Only used for ``paddle.load``. + Only used for ``paddle.load`` . Examples: .. 
code-block:: python @@ -616,11 +617,11 @@ class SaveLoadConfig(object): linear = paddle.nn.Linear(5, 1) state_dict = linear.state_dict() - paddle.save(state_dict, "paddle_dy") + paddle.save(state_dict, "paddle_dy.pdparams") - configs = paddle.SaveLoadConfig() - configs.keep_name_table = True - para_state_dict, _ = paddle.load("paddle_dy", configs) + config = paddle.SaveLoadConfig() + config.keep_name_table = True + para_state_dict = paddle.load("paddle_dy.pdparams", config) print(para_state_dict) # the name_table is 'StructuredToParameterName@@' @@ -867,7 +868,7 @@ def save(layer, model_path, input_spec=None, config=None): # 2. get program from Layer # TODO(chenweihang): add support for other method, not only forward - if isinstance(layer.forward, StaticLayer): + if isinstance(layer.forward, StaticFunction): concrete_program = layer.forward.concrete_program else: # transform in jit.save, if input_spec is incomplete, declarative will throw error diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 9c79deaab73ff7bde9a2414ceb67ad0d04103498..88e24e7e1ea99ffe6d4694ad7dd8f8a363cce71d 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -970,12 +970,12 @@ class Layer(core.Layer): paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() - paddle.save(state_dict, "paddle_dy") + paddle.save(state_dict, "paddle_dy.pdparams") - para_state_dict, _ = paddle.load("paddle_dy") + para_state_dict = paddle.load("paddle_dy.pdparams") emb.set_state_dict(para_state_dict) diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 3aa7b9dfc262810686319819f717f3cfd06b5e50..68206f62860852b1124b65da0e4124f60a2a8051 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -17,8 +17,7 @@ from __future__ import print_function from .. import core from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator from ..layers.layer_function_generator import OpProtoHolder -from ..layers import common_methods -from . import to_variable, no_grad +from . import no_grad import numpy as np import six @@ -53,47 +52,25 @@ def monkey_patch_math_varbase(): def astype(self, dtype): """ - **Notes**: - **The variable must be a** :ref:`api_fluid_Tensor` - Cast a variable to a specified data type. + Cast a Tensor to a specified data type. Args: - - self(Variable): The source variable - - dtype: The target data type + dtype: The target data type. Returns: - Variable: Variable with new dtype + Tensor: a new Tensor with target dtype Examples: - In Static Graph Mode: - - .. code-block:: python - - import paddle.fluid as fluid - - startup_prog = fluid.Program() - main_prog = fluid.Program() - with fluid.program_guard(startup_prog, main_prog): - original_variable = fluid.data(name = "new_variable", shape=[2,2], dtype='float32') - new_variable = original_variable.astype('int64') - print("new var's dtype is: {}".format(new_variable.dtype)) - - In Dygraph Mode: - .. 
code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np - x = np.ones([2, 2], np.float32) - with fluid.dygraph.guard(): - original_variable = fluid.dygraph.to_variable(x) - print("original var's dtype is: {}, numpy dtype is {}".format(original_variable.dtype, original_variable.numpy().dtype)) - new_variable = original_variable.astype('int64') - print("new var's dtype is: {}, numpy dtype is {}".format(new_variable.dtype, new_variable.numpy().dtype)) + original_tensor = paddle.ones([2, 2]) + print("original tensor's dtype is: {}".format(original_tensor.dtype)) + new_tensor = original_tensor.astype('float32') + print("new tensor's dtype is: {}".format(new_tensor.dtype)) """ if not isinstance(dtype, core.VarDesc.VarType): @@ -147,6 +124,10 @@ def monkey_patch_math_varbase(): def _ndim_(var): return len(var.shape) + @property + def _size_(var): + return np.prod(var.shape) + def _scalar_add_(var, value): return _scalar_elementwise_op_(var, 1.0, value) @@ -208,7 +189,6 @@ def monkey_patch_math_varbase(): __impl__.__doc__ = """ {0} Args: - self(Tensor): left hand Tensor other_var(Tensor|float|int): right hand Tensor Returns: @@ -217,23 +197,7 @@ def monkey_patch_math_varbase(): __impl__.__name__ = method_name return __impl__ - # Todo(zhouwei): implement dygraph template to adapt to any function, receive('op_type', 'arg_template') - # Such as _method_creator_('addmm', 'x, y, alpha=1.0, beta=1.0, name=None'). It can reduce call time. - def _method_creator_(op_type, arg_template=None): - def __impl__(self): - op = getattr(core.ops, op_type) - return op(self) - - __impl__.__doc__ = """ - - See paddle.{}""".format(op_type) - __impl__.__name__ = op_type - - return __impl__ - varbase_methods = [ - # Type1: From custom fun or lambda - ## b=-a ('__neg__', _neg_), ('__float__', _float_), ('__long__', _long_), @@ -244,8 +208,7 @@ def monkey_patch_math_varbase(): ('dim', lambda x: len(x.shape)), ('ndimension', lambda x: len(x.shape)), ('ndim', _ndim_), - ('size', lambda x: x.shape), - # Type2: From Template that create core.ops automatically. It's recommended. + ('size', _size_), ('__add__', _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)), ## a+b == b+a. 
Do not need to reverse explicitly @@ -283,31 +246,7 @@ def monkey_patch_math_varbase(): ('__le__', _binary_creator_('__le__', 'less_equal', False, None)), ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)), ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)), - ('__array_ufunc__', None), - ('sigmoid', _method_creator_('sigmoid', 'name=None')), - ('log_sigmoid', _method_creator_('logsigmoid', 'name=None')), - ('exp', _method_creator_('exp', 'name=None')), - ('tanh', _method_creator_('tanh', 'name=None')), - ('atan', _method_creator_('atan', 'name=None')), - ('tanh_shrink', _method_creator_('tanh_shrink', 'name=None')), - ('sqrt', _method_creator_('sqrt', 'name=None')), - ('rsqrt', _method_creator_('rsqrt', 'name=None')), - ('abs', _method_creator_('abs', 'name=None')), - ('ceil', _method_creator_('ceil', 'name=None')), - ('floor', _method_creator_('floor', 'name=None')), - ('cos', _method_creator_('cos', 'name=None')), - ('acos', _method_creator_('acos', 'name=None')), - ('asin', _method_creator_('asin', 'name=None')), - ('sin', _method_creator_('sin', 'name=None')), - ('sinh', _method_creator_('sinh', 'name=None')), - ('cosh', _method_creator_('cosh', 'name=None')), - ('round', _method_creator_('round', 'name=None')), - ('reciprocal', _method_creator_('reciprocal', 'name=None')), - ('square', _method_creator_('square', 'name=None')), - ('softplus', _method_creator_('softplus', 'name=None')), - ('softsign', _method_creator_('softsign', 'name=None')), - # Type3: Form module 'paddle.tensor' defaultly. - # It's not a goodway, because it will increase call time. + ('__array_ufunc__', None) ] global _already_patch_varbase @@ -318,7 +257,15 @@ def monkey_patch_math_varbase(): setattr(core.VarBase, method_name, method_impl) else: import paddle.tensor - for method_name in common_methods: + # Tensor method from module paddle.tensor + tensor_methods = paddle.tensor.linalg.__all__ + \ + paddle.tensor.math.__all__ + \ + paddle.tensor.logic.__all__ + \ + paddle.tensor.manipulation.__all__ + \ + paddle.tensor.search.__all__ + \ + paddle.tensor.stat.__all__ + \ + paddle.tensor.attribute.__all__ + for method_name in tensor_methods: if hasattr(core.VarBase, method_name): continue method_impl = getattr(paddle.tensor, method_name, None) if method_impl: setattr(core.VarBase, method_name, method_impl) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 472022bced7e3e2dd11d301501ebaec75e5e412a..de761cad5298aa00e05c664fb98077d73622e692 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -610,13 +610,13 @@ class DataParallel(layers.Layer): paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) emb = fluid.dygraph.DataParallel(emb, strategy) state_dict = emb.state_dict() - paddle.save(state_dict, "paddle_dy") + paddle.save(state_dict, "paddle_dy.pdparams") - para_state_dict, _ = paddle.load("paddle_dy") + para_state_dict = paddle.load("paddle_dy.pdparams") emb.set_state_dict(para_state_dict) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 2e3f34f41648a9343b4bccd1044bcd3f7b3d8189..3dc30767e5aa42d6a0a9f673e093f40045cbed87 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1355,7 +1355,7 @@ class Executor(object): if not program._fleet_opt is None: if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker": is_heter = 1 - if program._fleet_opt("trainer", "") == 
"HeterXpuTrainer": + if program._fleet_opt.get("trainer", "") == "HeterXpuTrainer": is_heter = 1 if scope is None: scope = global_scope() diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 797b32f5d4768af59fa4e6aceb75e4b6d9029d91..c7e66bb28770a659626c05dccbca7aa5d6bad10c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -217,7 +217,16 @@ def _dygraph_not_support_(func): def _dygraph_only_(func): def __impl__(*args, **kwargs): assert in_dygraph_mode( - ), "We Only support %s in dynamic mode, please call 'paddle.disable_static()' to enter dynamic mode." % func.__name__ + ), "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." % func.__name__ + return func(*args, **kwargs) + + return __impl__ + + +def _static_only_(func): + def __impl__(*args, **kwargs): + assert not in_dygraph_mode( + ), "We only support '%s()' in static graph mode, please call 'paddle.enable_static()' to enter static graph mode." % func.__name__ return func(*args, **kwargs) return __impl__ @@ -260,6 +269,7 @@ def deprecate_stat_dict(func): dygraph_not_support = wrap_decorator(_dygraph_not_support_) dygraph_only = wrap_decorator(_dygraph_only_) +static_only = wrap_decorator(_static_only_) fake_interface_only = wrap_decorator(_fake_interface_only_) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py index 236cb458be4c6a07f768761b41464e64d4d53f77..e556a98ed7504b199624deeac10ea594efa269b4 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py @@ -191,12 +191,14 @@ class FleetTranspiler(Fleet): self._communicator = Communicator( trainer_config.mode, kwargs, trainer_config.get_communicator_flags()) + self._communicator.init_with_ctx(send_ctx, recv_ctx) if not self._communicator.is_running(): self._communicator.start() else: - warnings.warn("communicator has been initialized, skip") + raise ValueError( + "Communicator can only be inited once, please check") def init_worker(self): """ diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py index 05deff10a2e1c914e9725c7d8697a704db6e7e42..a60c4e149f582e4f364910611d18cda5fbca4f07 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py @@ -624,6 +624,7 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False): value_dims = [] grad = None opt_idx = -1 + fuse = False for op in block.ops: opt_idx += 1 @@ -631,6 +632,9 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False): if op.type not in opt_value_map.keys(): continue + if op.type in ["sgd", "adam"]: + fuse = True + grad = main_program.global_block().vars[op.input("Grad")[0]] for value in opt_value_map[op.type]: @@ -644,7 +648,67 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False): if value_names: break - return grad, opt_idx, value_names, value_dims, acture_names + return grad, opt_idx, value_names, value_dims, acture_names, fuse + + def add_fuse_large_scale_op(block, global_block, table_name, value_names, + acture_names, grad, is_entry, opt_idx): + + op = 
block.ops[opt_idx] + + if op.type == "sgd": + grad = main_program.global_block().vars[op.input("Grad")[0]] + lr = main_program.global_block().vars[op.input("LearningRate")[0]] + + block._insert_op( + opt_idx, + type="lookup_sparse_table_fuse_sgd", + inputs={"Grad": grad, + "LearningRate": lr}, + attrs={ + "is_entry": is_entry, + "tablename": table_name, + "value_names": value_names + }) + + elif op.type == "adam": + grad = main_program.global_block().vars[op.input("Grad")[0]] + lr = main_program.global_block().vars[op.input("LearningRate")[0]] + beta1_pow = main_program.global_block().vars[op.input("Beta1Pow")[ + 0]] + beta2_pow = main_program.global_block().vars[op.input("Beta2Pow")[ + 0]] + beta1_pow_o = main_program.global_block().vars[op.output( + "Beta1PowOut")[0]] + beta2_pow_o = main_program.global_block().vars[op.output( + "Beta2PowOut")[0]] + + beta1 = op.attr('beta1') + beta2 = op.attr('beta2') + epsilon = op.attr('epsilon') + + block._insert_op( + opt_idx, + type="lookup_sparse_table_fuse_adam", + inputs={ + "Grad": grad, + "LearningRate": lr, + "Beta1Pow": beta1_pow, + "Beta2Pow": beta2_pow + }, + outputs={ + "Beta1PowOut": beta1_pow_o, + "Beta2PowOut": beta2_pow_o + }, + attrs={ + "beta1": beta1, + "beta2": beta2, + "epsilon": epsilon, + "is_entry": is_entry, + "tablename": table_name, + "value_names": value_names + }) + else: + raise ValueError("only support sgd/adam optimizer now") def add_large_scale_op(block, global_block, table_name, value_names, acture_names, grad, is_entry, opt_idx): @@ -711,24 +775,35 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False): for param, blockid in param_blockid_map.items(): opt_block = program.block(blockid) - grad, opt_idx, value_names, value_dims, acture_names = \ + grad, opt_idx, value_names, value_dims, acture_names, fuse = \ get_optimizer_values(opt_block) entry_attr = get_entry_attr(param) is_entry = False if entry_attr == "none" else True - add_large_scale_op(opt_block, - program.global_block(), param, value_names, - acture_names, grad, is_entry, opt_idx) + if fuse: + add_fuse_large_scale_op(opt_block, + program.global_block(), param, + value_names, acture_names, grad, + is_entry, opt_idx) + else: + add_large_scale_op(opt_block, + program.global_block(), param, value_names, + acture_names, grad, is_entry, opt_idx) else: large_scale_kv_metas = [] for param, blockid in param_blockid_map.items(): opt_block = main_program.block(blockid) - grad, _, value_names, value_dims, acture_names = \ + + grad, opt_idx, value_names, value_dims, acture_names, fuse = \ get_optimizer_values(opt_block) entry_attr = get_entry_attr(param) + if fuse: + # remove origin optimzier op + opt_block._remove_op(opt_idx) + # training/infer mode = "0" names_str = ",".join(value_names) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 4543af9820e8c9326098fa254494ca1c896d3b12..3f826da3ae2beca51b639a69da4113e6d9580d6c 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -227,22 +227,6 @@ def init_from_server_pass(program, config): fetch_barrier_out = program.global_block().create_var( name=framework.generate_control_dev_var_name()) - recv_ctx = config.get_communicator_recv_context(recv_type=1) - recv_varnames = [] - - for name, ctxs in recv_ctx.items(): - recv_varnames.extend(ctxs.origin_varnames()) - - 
program.global_block().append_op( - type="recv", - inputs={"X": []}, - outputs={"Out": []}, - attrs={ - "recv_varnames": recv_varnames, - "trainer_id": config.get_role_id(), - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE - }) - program.global_block().append_op( type="fetch_barrier", inputs={}, diff --git a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py index 60378aa98272dae32a97b33e84fc61e71193658c..06a90b78fd2e53d065f1abbaf9e95df848f9cc52 100644 --- a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py +++ b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py @@ -164,8 +164,8 @@ def train(args): elif fleet.is_worker(): logger.info("run trainer") - fleet.init_worker() exe.run(fleet.startup_program) + fleet.init_worker() thread_num = 2 filelist = [] diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index d513d44acfff230eb229e161e689fbc60a73c602..6b98dea42903e1392febd14b739b49cec7bc8c14 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -31,6 +31,7 @@ from ..unique_name import generate as unique_name import logging from ..data_feeder import check_dtype, check_type +from paddle.fluid.framework import static_only __all__ = [ 'data', 'read_file', 'double_buffer', 'py_reader', @@ -38,6 +39,7 @@ __all__ = [ ] +@static_only def data(name, shape, append_batch_size=True, diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 4595f0cf93916d71a3d0ec582af1917500d68f12..92b58a7e2ee4c76af7047a14f67e40d76be76dc0 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -54,29 +54,6 @@ EXPRESSION_MAP = { "__ge__": "A >= B" } -# method for Tensor from paddle.tensor -# edit it when paddle.tensor has new method about Tensor operation -common_methods = [ - 'exp', 'tanh', 'atan', 'sqrt', 'rsqrt', 'abs', 'ceil', 'floor', 'cos', - 'acos', 'asin', 'sin', 'sinh', 'cosh', 'round', 'reciprocal', 'square', - 'rank', 'matmul', 'dot', 'norm', 'transpose', 'dist', 't', 'cross', - 'cholesky', 'bmm', 'histogram', 'equal', 'greater_equal', 'greater_than', - 'is_empty', 'isfinite', 'less_equal', 'less_than', 'logical_and', - 'logical_not', 'logical_or', 'logical_xor', 'not_equal', 'reduce_all', - 'reduce_any', 'allclose', 'equal_all', 'cast', 'expand', 'expand_as', - 'tile', 'flatten', 'gather', 'gather_nd', 'reshape', 'reverse', 'scatter', - 'scatter_nd_add', 'scatter_nd', 'shard_index', 'slice', 'split', 'squeeze', - 'strided_slice', 'unique', 'unique_with_counts', 'unsqueeze', 'flip', - 'unbind', 'roll', 'cumsum', 'increment', 'log', 'pow', 'reciprocal', - 'round', 'rsqrt', 'scale', 'sign', 'stanh', 'sum', 'reduce_prod', 'max', - 'min', 'mm', 'div', 'multiply', 'add', 'logsumexp', 'log1p', 'erf', - 'addcmul', 'addmm', 'clamp', 'trace', 'kron', 'argmax', 'argmin', 'argsort', - 'has_inf', 'has_nan', 'topk', 'index_select', 'nonzero', 'sort', - 'index_sample', 'mean', 'std', 'var', 'elementwise_add', 'elementwise_div', - 'elementwise_floordiv', 'elementwise_mod', 'elementwise_pow', - 'elementwise_sub' -] - _already_patch_variable = False @@ -372,7 +349,14 @@ def monkey_patch_variable(): setattr(Variable, method_name, method_impl) else: import paddle.tensor - for method_name in common_methods: + variabel_methods = paddle.tensor.linalg.__all__ + \ + paddle.tensor.math.__all__ + \ + paddle.tensor.logic.__all__ + \ + paddle.tensor.manipulation.__all__ + \ + 
paddle.tensor.search.__all__ + \ + paddle.tensor.stat.__all__ + \ + paddle.tensor.attribute.__all__ + for method_name in variabel_methods: if hasattr(Variable, method_name): continue method_impl = getattr(paddle.tensor, method_name, None) if method_impl: setattr(Variable, method_name, method_impl) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index cf52f3b00fb2739d186021dc51d6aa0f506be706..2fba578ec077f2a74388d433bf3ab5b3098e81ad 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -1453,11 +1453,14 @@ def linspace(start, stop, num, dtype=None, name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) if not isinstance(start, Variable): - tensor_start = fill_constant([1], dtype, start) + with device_guard("cpu"): + tensor_start = fill_constant([1], dtype, start) if not isinstance(stop, Variable): - tensor_stop = fill_constant([1], dtype, stop) + with device_guard("cpu"): + tensor_stop = fill_constant([1], dtype, stop) if not isinstance(num, Variable): - tensor_num = fill_constant([1], 'int32', num) + with device_guard("cpu"): + tensor_num = fill_constant([1], 'int32', num) if in_dygraph_mode(): return core.ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', dtype) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 1e7915ed781a6441f32fb86c3c92e6f68ca66b93..0dd1694c86c354d5cab4e1081a16eb90da3872d5 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -182,23 +182,25 @@ class Optimizer(object): Examples: .. code-block:: python - import paddle + import paddle + import paddle.fluid as fluid paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() - paddle.save(state_dict, "paddle_dy") + fluid.save_dygraph(state_dict, "paddle_dy") - adam = paddle.optimizer.Adam(learning_rate=fluid.layers.noam_decay( 100, 10000), - parameter_list=emb.parameters()) + scheduler = paddle.optimizer.lr_scheduler.NoamLR( + d_model=0.01, warmup_steps=100, verbose=True) + adam = paddle.optimizer.Adam( + learning_rate=scheduler, + parameters=emb.parameters()) state_dict = adam.state_dict() + fluid.save_dygraph(state_dict, "paddle_dy") - para_state_dict, opti_state_dict = paddle.load("paddle_dy") - - adam.set_state_dict(opti_state_dict) - + para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy") ''' from paddle.optimizer.lr_scheduler import _LRScheduler if isinstance(self._learning_rate, _LRScheduler): diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 533222531f98b188f9fe5b47184ff39736488bd6..6cc00a7fd37347c0a85859f3e4177c33073264d6 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -167,10 +167,10 @@ class DataLoader(object): The variables should be created by :code:`fluid.data()`. :attr:`feed_list` must be set if :attr:`return_list` is False. Default None. - places(list(Place)|tuple(Place)): a list of Place, to put data - onto, :attr:`places` must be set in both static graph and - dynamic graph mode, in dynamic graph mode, place number must - be 1. Default None. + places(list(Place)|tuple(Place)|optional): a list of Place, + to put data onto, :attr:`places` can be None, if + :attr:`places` is None, default place(CPUPlace or CUDAPlace(0)) + will be used. Default None. return_list (bool): whether the return value on each device is presented as a list. 
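            Editor's sketch (hypothetical, not part of this patch): a compact example of
            the relaxed ``places`` argument documented above. ``RandomDataset`` is a toy
            dataset defined inline, and ``places`` is deliberately omitted so the default
            place (CPUPlace or CUDAPlace(0)) is chosen automatically; in dygraph mode each
            batch comes back as a list that can be unpacked directly.

            .. code-block:: python

                import numpy as np
                import paddle
                from paddle.io import Dataset, DataLoader

                class RandomDataset(Dataset):
                    def __init__(self, num_samples):
                        self.num_samples = num_samples

                    def __getitem__(self, idx):
                        image = np.random.random([784]).astype('float32')
                        label = np.random.randint(0, 9, (1, )).astype('int64')
                        return image, label

                    def __len__(self):
                        return self.num_samples

                paddle.disable_static()
                loader = DataLoader(RandomDataset(64), batch_size=16, shuffle=True)
                for image, label in loader():
                    print(image.shape, label.shape)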
If :attr:`return_list=False`, the return value on each device would be a dict of str -> LoDTensor, where @@ -222,6 +222,8 @@ class DataLoader(object): .. code-block:: python import numpy as np + + import paddle import paddle.fluid as fluid from paddle.io import Dataset, BatchSampler, DataLoader @@ -247,11 +249,48 @@ class DataLoader(object): def __len__(self): return self.num_samples + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + # get places places = fluid.cuda_places() if USE_GPU else fluid.cpu_places() + # --------------------- dygraph mode -------------------- + + class SimpleNet(fluid.dygraph.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax') + + def forward(self, image, label=None): + return self.fc(image) + + with fluid.dygraph.guard(places[0]): + simple_net = SimpleNet() + opt = fluid.optimizer.SGD(learning_rate=1e-3, + parameter_list=simple_net.parameters()) + + loader = DataLoader(dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + for e in range(EPOCH_NUM): + for i, (image, label) in enumerate(loader()): + out = simple_net(image) + loss = fluid.layers.cross_entropy(out, label) + avg_loss = fluid.layers.reduce_mean(loss) + avg_loss.backward() + opt.minimize(avg_loss) + simple_net.clear_gradients() + print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) + + # ------------------------------------------------------- + # -------------------- static graph --------------------- + paddle.enable_static() + def simple_net(image, label): fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax') cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label) @@ -270,11 +309,8 @@ class DataLoader(object): prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name) - dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) - loader = DataLoader(dataset, feed_list=[image, label], - places=places, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, @@ -287,39 +323,6 @@ class DataLoader(object): # ------------------------------------------------------- - # --------------------- dygraph mode -------------------- - - class SimpleNet(fluid.dygraph.Layer): - def __init__(self): - super(SimpleNet, self).__init__() - self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax') - - def forward(self, image, label=None): - return self.fc(image) - - with fluid.dygraph.guard(places[0]): - simple_net = SimpleNet() - opt = fluid.optimizer.SGD(learning_rate=1e-3, - parameter_list=simple_net.parameters()) - - loader = DataLoader(dataset, - places=places[0], - batch_size=BATCH_SIZE, - shuffle=True, - drop_last=True, - num_workers=2) - - for e in range(EPOCH_NUM): - for i, (image, label) in enumerate(loader()): - out = simple_net(image) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.reduce_mean(loss) - avg_loss.backward() - opt.minimize(avg_loss) - simple_net.clear_gradients() - print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) - - # ------------------------------------------------------- .. 
note:: For reading iterable dataset with multiprocess Dataloader, @@ -356,11 +359,9 @@ class DataLoader(object): "feed_list should be set when return_list=False" self.feed_list = feed_list - assert places is not None, "places cannot be None" + if places is None: + places = _current_expected_place() self.places = _convert_places(places) - if in_dygraph_mode(): - assert len(self.places) == 1, \ - "Number of places must be 1 in dygraph mode" assert num_workers >= 0, "num_workers should be a non-negative value" if num_workers > 0 and (sys.platform == 'darwin' or diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt index 673c965b662a022739f8d489c331f4de9455a926..96321aae566d1f910042f4e348d0be8b3e88c341 100644 --- a/python/paddle/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/CMakeLists.txt @@ -4,4 +4,5 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") # default test foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) + set_tests_properties(${src} PROPERTIES FIXTURES_SETUP ${src}_infer_model) endforeach() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 8d236dca22f2266771a029b7cdbf7db21aefb1fe..0979757680103fda081fe891486e0c1371a8b59d 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -45,6 +45,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_fp16_allreduce_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor) list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base) @@ -335,7 +336,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) list(REMOVE_ITEM TEST_OPS test_sampling_id_op) - if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_dataset) list(REMOVE_ITEM TEST_OPS test_dataset_dataloader) @@ -456,6 +456,7 @@ if(WITH_DISTRIBUTE) py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS}) py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 8277499fcce341207fa75a74dfda0a2ccc2e3b63..f650dd0f7e9824ce2c2c495f1fb85fe1fc421d08 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -28,7 +28,6 @@ import numpy as np import ctr_dataset_reader from test_dist_fleet_base import runtime_main, FleetDistRunnerBase -from 
paddle.distributed.fleet.base.util_factory import fleet_util paddle.enable_static() @@ -163,8 +162,10 @@ class TestDistCTR2x2(FleetDistRunnerBase): """ exe = fluid.Executor(fluid.CPUPlace()) - fleet.init_worker() + exe.run(fluid.default_startup_program()) + fleet.init_worker() + batch_size = 4 train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size) self.reader.decorate_sample_list_generator(train_reader) @@ -178,13 +179,13 @@ class TestDistCTR2x2(FleetDistRunnerBase): fetch_list=[self.avg_cost.name]) loss_val = np.mean(loss_val) # TODO(randomly fail) - # reduce_output = fleet_util.all_reduce( + # reduce_output = fleet.util.all_reduce( # np.array(loss_val), mode="sum") - # loss_all_trainer = fleet_util.all_gather(float(loss_val)) + # loss_all_trainer = fleet.util.all_gather(float(loss_val)) # loss_val = float(reduce_output) / len(loss_all_trainer) message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id, loss_val) - fleet_util.print_on_rank(message, 0) + fleet.util.print_on_rank(message, 0) pass_time = time.time() - pass_start except fluid.core.EOFException: @@ -202,8 +203,8 @@ class TestDistCTR2x2(FleetDistRunnerBase): exe = fluid.Executor(fluid.CPUPlace()) - fleet.init_worker() exe.run(fluid.default_startup_program()) + fleet.init_worker() thread_num = 2 batch_size = 128 diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py index 0e3c80992771424e4216a79b991de1c62884c757..7accc917f808e6f010e7f38de651a8a3965fcdf3 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py @@ -29,7 +29,6 @@ import numpy as np import ctr_dataset_reader from test_dist_fleet_base import runtime_main, FleetDistRunnerBase from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader -from paddle.distributed.fleet.base.util_factory import fleet_util # Fix seed for test fluid.default_startup_program().random_seed = 1 @@ -60,8 +59,9 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2): device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = fluid.CUDAPlace(device_id) exe = fluid.Executor(place) - fleet.init_worker() + exe.run(fleet.startup_program) + fleet.init_worker() batch_size = 4 train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size) @@ -75,13 +75,13 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2): loss_val = exe.run(program=fleet.main_program, fetch_list=[self.avg_cost.name]) loss_val = np.mean(loss_val) - reduce_output = fleet_util.all_reduce( + reduce_output = fleet.util.all_reduce( np.array(loss_val), mode="sum") - loss_all_trainer = fleet_util.all_gather(float(loss_val)) + loss_all_trainer = fleet.util.all_gather(float(loss_val)) loss_val = float(reduce_output) / len(loss_all_trainer) message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id, loss_val) - fleet_util.print_on_rank(message, 0) + fleet.util.print_on_rank(message, 0) pass_time = time.time() - pass_start except fluid.core.EOFException: @@ -104,8 +104,8 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2): place = fluid.CUDAPlace(device_id) exe = fluid.Executor(place) - fleet.init_worker() exe.run(fleet.startup_program) + fleet.init_worker() thread_num = 2 batch_size = 128 diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py index 2f938a813d8a7598e49023066759a490eab53263..f62ad66e462862f4c3f04bacc58ca7aac583ef1e 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py +++ 
b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py @@ -29,7 +29,6 @@ import numpy as np import ctr_dataset_reader from test_dist_fleet_heter_base import runtime_main, FleetDistHeterRunnerBase from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader -from paddle.distributed.fleet.base.util_factory import fleet_util paddle.enable_static() @@ -152,8 +151,9 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase): """ exe = fluid.Executor(fluid.CPUPlace()) - fleet.init_worker() exe.run(fluid.default_startup_program()) + fleet.init_worker() + batch_size = 4 train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size) self.reader.decorate_sample_list_generator(train_reader) @@ -176,12 +176,12 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase): exe = fluid.Executor(fluid.CPUPlace()) - fleet.init_worker() exe.run(fluid.default_startup_program()) + fleet.init_worker() thread_num = int(os.getenv("CPU_NUM", 2)) batch_size = 128 - filelist = fleet_util.get_file_shard(train_file_list) + filelist = fleet.util.get_file_shard(train_file_list) print("filelist: {}".format(filelist)) # config dataset diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py index 2ea69e1b6763087bb2b278b59a8a59b4331847da..fb7ddef862d0f0bd54f10e090f6072a3dd808057 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py @@ -32,7 +32,6 @@ import os import signal from functools import reduce from test_dist_fleet_base import runtime_main, FleetDistRunnerBase -from paddle.distributed.fleet.base.util_factory import fleet_util paddle.enable_static() @@ -198,7 +197,7 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase): def net(self, args, batch_size=4, lr=0.01): avg_cost, _, predict, self.reader = \ train_network(batch_size=batch_size, is_distributed=False, - is_sparse=True, is_self_contained_lr=False, is_pyreader=(args.reader == "pyreader")) + is_sparse=True, is_self_contained_lr=False, is_pyreader=(args.reader == "pyreader")) self.avg_cost = avg_cost self.predict = predict @@ -222,8 +221,8 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase): """ exe = fluid.Executor(fluid.CPUPlace()) - fleet.init_worker() exe.run(fluid.default_startup_program()) + fleet.init_worker() batch_size = 4 # reader train_reader = paddle.batch(fake_simnet_reader(), batch_size=batch_size) @@ -238,7 +237,7 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase): loss_val = np.mean(loss_val) message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id, loss_val) - fleet_util.print_on_rank(message, 0) + fleet.util.print_on_rank(message, 0) pass_time = time.time() - pass_start except fluid.core.EOFException: diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py index 77697896b4d556da8a98c17e281b3d7a6999fd64..81530573a604205f0202d088853038bbc71b92e6 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py @@ -151,8 +151,9 @@ class TestDistCTR2x2(FleetDistRunnerBase): """ exe = fluid.Executor(fluid.CPUPlace()) - fleet.init_worker() + exe.run(fluid.default_startup_program()) + fleet.init_worker() batch_size = 4 diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py new file mode 100644 index 
0000000000000000000000000000000000000000..3198c6cac86c26bc6708df509ffc88aca12d52f3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +from paddle.distributed.fleet.meta_optimizers import FP16AllReduceOptimizer as FP16AllReduce +from test_dist_base import TestDistRunnerBase, runtime_main +from dist_mnist import cnn_model + +DTYPE = "float32" +paddle.dataset.mnist.fetch() + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +class TestDistMnist2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + # Optimization + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = FP16AllReduce(opt) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + opt.minimize(avg_cost) + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +if __name__ == "__main__": + runtime_main(TestDistMnist2x2) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index 450ef7557bc1574c31a00d05154aead19083c1bc..095eda2a5cba72ed21e5aa64a5990cd1ef2c27e0 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -19,7 +19,7 @@ import paddle import paddle.fluid as fluid from paddle.static import InputSpec from paddle.fluid.dygraph import to_variable, declarative, ProgramTranslator, Layer, jit -from paddle.fluid.dygraph.dygraph_to_static.program_translator import ConcreteProgram, StaticLayer +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ConcreteProgram, StaticFunction from test_basic_api_transformation import dyfunc_to_variable @@ -81,14 +81,14 @@ class SimpleNet(Layer): return z -class TestStaticLayerInstance(unittest.TestCase): +class TestStaticFunctionInstance(unittest.TestCase): def test_instance_same_class(self): with fluid.dygraph.guard(fluid.CPUPlace()): net_1 = SimpleNet() net_2 = SimpleNet() - self.assertTrue(isinstance(net_1.forward, 
StaticLayer)) - self.assertTrue(isinstance(net_2.forward, StaticLayer)) + self.assertTrue(isinstance(net_1.forward, StaticFunction)) + self.assertTrue(isinstance(net_2.forward, StaticFunction)) self.assertNotEqual(net_1.forward, net_2.forward) # convert layer into static progam of net_1 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..5d96994a33b2c05446b67df44bd8999352373d43 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py @@ -0,0 +1,106 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import PassVersionChecker + + +class ConvActivationMkldnnFusePassTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + conv_out = fluid.layers.conv2d( + data, + num_filters=self.conv_num_filters, + filter_size=self.conv_filter_size, + bias_attr=self.conv_bias_attr, + act=self.act) + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def set_params(self): + self.conv_num_filters = 3 + self.conv_filter_size = 3 + self.conv_bias_attr = False + self.act = "relu" + self.pass_name = 'conv_relu_mkldnn_fuse_pass' + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + def test_pass_compatible(self): + self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) + + +class ConvActivationMkldnnFusePassTest_1(ConvActivationMkldnnFusePassTest): + def set_params(self): + self.conv_num_filters = 5 + self.conv_filter_size = 5 + self.conv_bias_attr = True + self.act = "relu" + self.pass_name = 'conv_relu_mkldnn_fuse_pass' + + +class ConvActivationMkldnnFusePassTest_2(ConvActivationMkldnnFusePassTest): + def set_params(self): + self.conv_num_filters = 3 + self.conv_filter_size = 3 + self.conv_bias_attr = False + self.act = "leaky_relu" + self.pass_name = 'conv_leaky_relu_mkldnn_fuse_pass' + + +class ConvActivationMkldnnFusePassTest_3(ConvActivationMkldnnFusePassTest): + def set_params(self): + self.conv_num_filters = 5 + self.conv_filter_size = 5 + self.conv_bias_attr = True + self.act = "leaky_relu" + self.pass_name = 'conv_leaky_relu_mkldnn_fuse_pass' + + +class ConvActivationMkldnnFusePassTest_4(ConvActivationMkldnnFusePassTest): + def set_params(self): + self.conv_num_filters = 3 + self.conv_filter_size = 3 + self.conv_bias_attr = 
False + self.act = "relu6" + self.pass_name = 'conv_relu6_mkldnn_fuse_pass' + + +class ConvActivationMkldnnFusePassTest_4(ConvActivationMkldnnFusePassTest): + def set_params(self): + self.conv_num_filters = 5 + self.conv_filter_size = 5 + self.conv_bias_attr = True + self.act = "swish" + self.pass_name = 'conv_swish_mkldnn_fuse_pass' + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..45097f6b8191d045d0665d7478e4090c0ae20cb3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import PassVersionChecker + + +class ConvConcatReluMkldnnFusePassTest_0(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data_1 = fluid.data( + name="data_1", shape=[-1, 3, 100, 100], dtype="float32") + data_2 = fluid.data( + name="data_2", shape=[-1, 3, 100, 100], dtype="float32") + conv_1 = fluid.layers.conv2d( + data_1, + num_filters=self.conv1_num_filters, + filter_size=self.conv1_filter_size, + padding=self.conv1_padding, + bias_attr=self.conv1_bias_attr) + conv_2 = fluid.layers.conv2d( + data_2, + num_filters=self.conv2_num_filters, + filter_size=self.conv2_filter_size, + padding=self.conv2_padding, + bias_attr=self.conv2_bias_attr) + concat = fluid.layers.concat( + [conv_1, conv_2], axis=self.concat_axis) + out = fluid.layers.relu(concat) + + self.feeds = { + "data_1": np.random.random((1, 3, 100, 100)).astype("float32"), + "data_2": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [out] + self.enable_mkldnn = True + + def set_params(self): + self.conv1_num_filters = 3 + self.conv1_filter_size = 3 + self.conv1_padding = 0 + self.conv1_bias_attr = False + self.conv2_num_filters = 3 + self.conv2_filter_size = 3 + self.conv2_padding = 0 + self.conv2_bias_attr = False + self.concat_axis = 0 + self.pass_name = "conv_concat_relu_mkldnn_fuse_pass" + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + def test_pass_compatible(self): + self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) + + +class ConvConcatReluMkldnnFusePassTest_1(ConvConcatReluMkldnnFusePassTest_0): + def set_params(self): + self.conv1_num_filters = 3 + self.conv1_filter_size = 3 + self.conv1_padding = 0 + self.conv1_bias_attr = False + self.conv2_num_filters = 5 + 
self.conv2_filter_size = 5 + self.conv2_padding = 1 + self.conv2_bias_attr = True + self.concat_axis = 1 + self.pass_name = "conv_concat_relu_mkldnn_fuse_pass" + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..a6b5e0e54739b37b5f2e490fc890fbe56c2f83f2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import PassVersionChecker + + +class MatmulTransposeReshapeMkldnnFusePassTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=self.data_shape, dtype="float32") + weight = fluid.layers.create_parameter( + shape=self.weight_shape, dtype="float32") + matmul = fluid.layers.matmul( + data, + weight, + transpose_x=self.transpose_x, + transpose_y=self.transpose_y) + transpose = fluid.layers.transpose(matmul, self.tranpose_perm) + reshape = fluid.layers.reshape(transpose, shape=self.reshape_shape) + + self.fetch_list = [reshape] + self.enable_mkldnn = True + + def set_params(self): + self.data_shape = [-1, 3, 100, 110] + self.weight_shape = [1, 3, 110, 100] + self.feeds = { + "data": np.random.random((1, 3, 100, 110)).astype("float32") + } + self.transpose_x = False + self.transpose_y = False + self.tranpose_perm = [0, 2, 1, 3] + self.reshape_shape = [3, 100, 100] + self.pass_name = 'matmul_transpose_reshape_fuse_pass' + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + def test_pass_compatible(self): + self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) + + +class MatmulTransposeReshapeMkldnnFusePassTest_1( + MatmulTransposeReshapeMkldnnFusePassTest): + def set_params(self): + self.data_shape = [-1, 3, 100, 100] + self.weight_shape = [1, 3, 100, 100] + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.transpose_x = True + self.transpose_y = True + self.tranpose_perm = [0, 2, 1, 3] + self.reshape_shape = [6, 50, 100] + self.pass_name = 'matmul_transpose_reshape_fuse_pass' + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py new file mode 100644 index 
0000000000000000000000000000000000000000..55a6b543f0aeafe75940255565e3f02ae9194b99 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py @@ -0,0 +1,73 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import PassVersionChecker + + +class ScaleMatmulMkldnnFusePassTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[1, 3, 100, 100], dtype="float32") + weight = fluid.layers.create_parameter( + shape=[1, 3, 100, 100], dtype="float32") + scale = fluid.layers.scale(data, scale=self.scale_scale) + matmul = fluid.layers.matmul( + scale, + weight, + transpose_x=self.transpose_x, + transpose_y=self.transpose_y) + + self.fetch_list = [matmul] + self.enable_mkldnn = True + + def set_params(self): + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.scale_scale = 2.0 + self.transpose_x = False + self.transpose_y = False + self.pass_name = "scale_matmul_fuse_pass" + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + def test_pass_compatible(self): + self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) + + +class ScaleMatmulMkldnnFusePassTest_1(ScaleMatmulMkldnnFusePassTest): + def set_params(self): + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.scale_scale = 5.0 + self.transpose_x = True + self.transpose_y = True + self.pass_name = "scale_matmul_fuse_pass" + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_relu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py similarity index 57% rename from python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_relu_fuse_pass.py rename to python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py index 2346e93d64dce21d9bdd7687bd8d5ed38ff5f188..33f215dafda21c68af3edb6baaeca802edf82c5a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_relu_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py @@ -20,25 +20,13 @@ from inference_pass_test import InferencePassTest import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import PassVersionChecker -class ConvBnFusePassMKLDNNTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 3, 100, 100], 
dtype="float32") - conv_out = fluid.layers.conv2d( - data, num_filters=3, filter_size=3, bias_attr=False, act="relu") - - self.feeds = { - "data": np.random.random((1, 3, 100, 100)).astype("float32") - } - self.fetch_list = [conv_out] - self.enable_mkldnn = True - - def test_check_output(self): - use_gpu = False - self.check_output_with_option(use_gpu) +class SeqConcatFCFusePassTest(InferencePassTest): + def test_compatible(self): + self.assertTrue( + PassVersionChecker.IsCompatible('seq_concat_fc_fuse_pass')) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py index 8d2677229a03f7bdac14a93e176747ba0a5f1d6b..ab1127afa58dd93aa92688eebdf82292990f59b1 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py @@ -47,7 +47,7 @@ class TestSimpleRNNCell(unittest.TestCase): prev_h = np.random.randn(4, 32) y1, h1 = rnn1(x, prev_h) - y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h)) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) def test_with_zero_state(self): @@ -57,7 +57,7 @@ class TestSimpleRNNCell(unittest.TestCase): x = np.random.randn(4, 16) y1, h1 = rnn1(x) - y2, h2 = rnn2(paddle.to_variable(x)) + y2, h2 = rnn2(paddle.to_tensor(x)) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) def runTest(self): @@ -90,7 +90,7 @@ class TestGRUCell(unittest.TestCase): prev_h = np.random.randn(4, 32) y1, h1 = rnn1(x, prev_h) - y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h)) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) def test_with_zero_state(self): @@ -100,7 +100,7 @@ class TestGRUCell(unittest.TestCase): x = np.random.randn(4, 16) y1, h1 = rnn1(x) - y2, h2 = rnn2(paddle.to_variable(x)) + y2, h2 = rnn2(paddle.to_tensor(x)) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) def runTest(self): @@ -134,8 +134,8 @@ class TestLSTMCell(unittest.TestCase): y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) y2, (h2, c2) = rnn2( - paddle.to_variable(x), - (paddle.to_variable(prev_h), paddle.to_variable(prev_c))) + paddle.to_tensor(x), + (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c))) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) @@ -146,7 +146,7 @@ class TestLSTMCell(unittest.TestCase): x = np.random.randn(4, 16) y1, (h1, c1) = rnn1(x) - y2, (h2, c2) = rnn2(paddle.to_variable(x)) + y2, (h2, c2) = rnn2(paddle.to_tensor(x)) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py index ef297b3bb62497073fd667238cae8a83daaa4967..7c03b51837ef6f7be8021dca55daf3b43f7d3053 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py @@ -53,7 +53,7 @@ class TestSimpleRNN(unittest.TestCase): prev_h = np.random.randn(2 * self.num_directions, 4, 32) y1, h1 = rnn1(x, prev_h) - y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h)) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) 
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) @@ -66,7 +66,7 @@ class TestSimpleRNN(unittest.TestCase): x = np.transpose(x, [1, 0, 2]) y1, h1 = rnn1(x) - y2, h2 = rnn2(paddle.to_variable(x)) + y2, h2 = rnn2(paddle.to_tensor(x)) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) @@ -81,11 +81,11 @@ class TestSimpleRNN(unittest.TestCase): y1, h1 = rnn1(x, sequence_length=sequence_length) - seq_len = paddle.to_variable(sequence_length) + seq_len = paddle.to_tensor(sequence_length) mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) if self.time_major: mask = paddle.transpose(mask, [1, 0]) - y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len) y2 = paddle.multiply(y2, mask, axis=0) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) @@ -133,7 +133,7 @@ class TestGRU(unittest.TestCase): prev_h = np.random.randn(2 * self.num_directions, 4, 32) y1, h1 = rnn1(x, prev_h) - y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h)) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) @@ -146,7 +146,7 @@ class TestGRU(unittest.TestCase): x = np.transpose(x, [1, 0, 2]) y1, h1 = rnn1(x) - y2, h2 = rnn2(paddle.to_variable(x)) + y2, h2 = rnn2(paddle.to_tensor(x)) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) @@ -161,11 +161,11 @@ class TestGRU(unittest.TestCase): y1, h1 = rnn1(x, sequence_length=sequence_length) - seq_len = paddle.to_variable(sequence_length) + seq_len = paddle.to_tensor(sequence_length) mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) if self.time_major: mask = paddle.transpose(mask, [1, 0]) - y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len) y2 = paddle.multiply(y2, mask, axis=0) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) @@ -209,8 +209,8 @@ class TestLSTM(unittest.TestCase): y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) y2, (h2, c2) = rnn2( - paddle.to_variable(x), - (paddle.to_variable(prev_h), paddle.to_variable(prev_c))) + paddle.to_tensor(x), + (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c))) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) @@ -224,7 +224,7 @@ class TestLSTM(unittest.TestCase): x = np.transpose(x, [1, 0, 2]) y1, (h1, c1) = rnn1(x) - y2, (h2, c2) = rnn2(paddle.to_variable(x)) + y2, (h2, c2) = rnn2(paddle.to_tensor(x)) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) @@ -240,11 +240,11 @@ class TestLSTM(unittest.TestCase): y1, (h1, c1) = rnn1(x, sequence_length=sequence_length) - seq_len = paddle.to_variable(sequence_length) + seq_len = paddle.to_tensor(sequence_length) mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) if self.time_major: mask = paddle.transpose(mask, [1, 0]) - y2, (h2, c2) = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2, (h2, c2) = rnn2(paddle.to_tensor(x), sequence_length=seq_len) y2 = paddle.multiply(y2, mask, 
axis=0) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index db9e8d2c6bda011bef7c23e7fb51e246137a3906..e8b8a45fb677568947be82a1c77e6f2e7a17cdc1 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -147,5 +147,29 @@ class TestSquareDoubleGradCheck(unittest.TestCase): self.func(p) + +class TestAbsDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + # the shape of the input variable should be specified explicitly and must not include -1. + shape = [2, 3, 7, 9] + eps = 1e-6 + dtype = np.float64 + + x = layers.data('x', shape, False, dtype) + x.persistable = True + y = layers.abs(x) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index f6ba03194aa909279aa2cd884fc575041b01a4cd..791f1ee2dfa534437deb903fc60e2904a8b396a1 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -28,6 +28,7 @@ from paddle.fluid import compiler, Program, program_guard class TestSqrtOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): # The input type of sqrt op must be Variable or numpy.ndarray. in1 = 1 @@ -44,6 +45,7 @@ class TestSqrtOpError(unittest.TestCase): class TestActivation(OpTest): def setUp(self): + paddle.enable_static() self.op_type = "exp" self.init_dtype() self.init_kernel_type() @@ -71,6 +73,7 @@ class TestActivation(OpTest): class TestParameter(object): def test_out_name(self): + paddle.enable_static() with fluid.program_guard(fluid.Program()): np_x = np.array([0.1]) data = fluid.layers.data(name="X", shape=[1]) @@ -92,6 +95,7 @@ class TestParameter(object): class TestSigmoid(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "sigmoid" self.init_dtype() @@ -112,6 +116,7 @@ class TestSigmoid(TestActivation): class TestLogSigmoid(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "logsigmoid" self.init_dtype() @@ -180,6 +185,7 @@ class TestLogSigmoidAPI(unittest.TestCase): class TestTanh(TestActivation, TestParameter): def setUp(self): + paddle.enable_static() self.op_type = "tanh" self.init_dtype() x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) @@ -255,6 +261,7 @@ class TestTanhAPI(unittest.TestCase): class TestAtan(TestActivation, TestParameter): def setUp(self): + paddle.enable_static() self.op_type = "atan" self.init_dtype() @@ -291,6 +298,7 @@ class TestAtan(TestActivation, TestParameter): class TestSinh(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "sinh" self.init_dtype() @@ -349,6 +357,7 @@ class TestSinh(TestActivation): class TestSinhOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program()): # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.sinh, 1) @@ -362,6 +371,7 @@ class TestSinhOpError(unittest.TestCase): class TestCosh(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "cosh" self.init_dtype() @@ -420,6 +430,7 @@ class TestCosh(TestActivation): class TestCoshOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.cosh, 1) @@ -438,6 +449,7 @@ def ref_tanhshrink(x): class TestTanhshrink(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "tanh_shrink" self.init_dtype() @@ -512,6 +524,7 @@ def ref_hardshrink(x, threshold): class TestHardShrink(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "hard_shrink" self.init_dtype() @@ -541,6 +554,7 @@ class TestHardShrink_threshold_negative(TestHardShrink): class TestHardShrinkAPI(unittest.TestCase): # test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink def setUp(self): + paddle.enable_static() self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32') self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ else paddle.CPUPlace() @@ -662,6 +676,7 @@ def ref_softshrink(x, threshold=0.5): class TestSoftshrink(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "softshrink" self.init_dtype() @@ -736,6 +751,7 @@ class TestSoftshrinkAPI(unittest.TestCase): class TestSqrt(TestActivation, TestParameter): def setUp(self): + paddle.enable_static() self.op_type = "sqrt" self.init_dtype() @@ -753,6 +769,7 @@ class TestSqrt(TestActivation, TestParameter): class TestRsqrt(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "rsqrt" self.init_dtype() @@ -770,6 +787,7 @@ class TestRsqrt(TestActivation): class TestAbs(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "abs" self.init_dtype() @@ -792,6 +810,7 @@ class TestAbs(TestActivation): class TestCeil(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "ceil" self.init_dtype() @@ -808,6 +827,7 @@ class TestCeil(TestActivation): class TestFloor(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "floor" self.init_dtype() @@ -826,6 +846,7 @@ class TestFloor(TestActivation): class TestCos(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "cos" self.init_dtype() @@ -843,6 +864,7 @@ class TestCos(TestActivation): class TestAcos(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "acos" self.init_dtype() @@ -860,6 +882,7 @@ class TestAcos(TestActivation): class TestSin(TestActivation, TestParameter): def setUp(self): + paddle.enable_static() self.op_type = "sin" self.init_dtype() @@ -877,6 +900,7 @@ class TestSin(TestActivation, TestParameter): class TestAsin(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "asin" self.init_dtype() @@ -894,6 +918,7 @@ class TestAsin(TestActivation): class TestRound(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "round" self.init_dtype() @@ -909,6 +934,7 @@ class TestRound(TestActivation): class TestRelu(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "relu" self.init_dtype() @@ -979,6 +1005,7 @@ class TestLeakyRelu(TestActivation): return 0.02 def setUp(self): + paddle.enable_static() self.op_type = "leaky_relu" self.init_dtype() alpha = self.get_alpha() @@ -1084,6 +1111,7 @@ def gelu(x, approximate): class 
TestGeluApproximate(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "gelu" self.init_dtype() approximate = True @@ -1102,6 +1130,7 @@ class TestGeluApproximate(TestActivation): class TestGelu(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "gelu" self.init_dtype() approximate = False @@ -1169,6 +1198,7 @@ class TestGELUAPI(unittest.TestCase): class TestBRelu(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "brelu" self.init_dtype() @@ -1194,6 +1224,7 @@ class TestBRelu(TestActivation): class TestBReluOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.brelu, 1) @@ -1215,6 +1246,7 @@ def ref_relu6(x, threshold=6.0): class TestRelu6(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "relu6" self.init_dtype() @@ -1286,6 +1318,7 @@ class TestRelu6API(unittest.TestCase): class TestHardSwish(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = 'hard_swish' self.init_dtype() @@ -1310,6 +1343,7 @@ class TestHardSwish(TestActivation): class TestHardSwishOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.hard_swish, 1) @@ -1323,6 +1357,7 @@ class TestHardSwishOpError(unittest.TestCase): class TestSoftRelu(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "soft_relu" self.init_dtype() @@ -1348,6 +1383,7 @@ class TestSoftRelu(TestActivation): class TestSoftReluOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.soft_relu, 1) @@ -1366,6 +1402,7 @@ def elu(x, alpha): class TestELU(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "elu" self.init_dtype() @@ -1435,6 +1472,7 @@ class TestELUAPI(unittest.TestCase): class TestReciprocal(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "reciprocal" self.init_dtype() @@ -1452,6 +1490,7 @@ class TestReciprocal(TestActivation): class TestLog(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "log" self.init_dtype() @@ -1478,6 +1517,7 @@ class TestLog(TestActivation): class TestLog1p(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "log1p" self.init_dtype() @@ -1522,6 +1562,7 @@ class TestLog1p(TestActivation): class TestSquare(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "square" self.init_dtype() @@ -1539,6 +1580,7 @@ class TestSquare(TestActivation): class TestPow(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "pow" self.init_dtype() @@ -1557,6 +1599,7 @@ class TestPow(TestActivation): class TestPow_factor_tensor(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "pow" self.init_dtype() @@ -1633,6 +1676,7 @@ class TestPow_factor_tensor(TestActivation): class TestSTanh(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "stanh" self.init_dtype() @@ -1653,6 +1697,7 @@ class TestSTanh(TestActivation): class TestSTanhOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program()): # The input type must be Variable. 
self.assertRaises(TypeError, fluid.layers.stanh, 1) @@ -1673,6 +1718,7 @@ def ref_softplus(x, beta=1, threshold=20): class TestSoftplus(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "softplus" self.init_dtype() @@ -1751,6 +1797,7 @@ def ref_softsign(x): class TestSoftsign(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "softsign" self.init_dtype() @@ -1818,6 +1865,7 @@ class TestSoftsignAPI(unittest.TestCase): class TestThresholdedRelu(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "thresholded_relu" self.init_dtype() @@ -1841,6 +1889,7 @@ class TestThresholdedRelu(TestActivation): class TestThresholdedReluOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.thresholded_relu, 1) @@ -1854,6 +1903,7 @@ class TestThresholdedReluOpError(unittest.TestCase): class TestHardSigmoid(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "hard_sigmoid" self.init_dtype() @@ -1883,6 +1933,7 @@ class TestHardSigmoid(TestActivation): class TestHardSigmoidOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.hard_sigmoid, 1) @@ -1896,6 +1947,7 @@ class TestHardSigmoidOpError(unittest.TestCase): class TestSwish(TestActivation): def setUp(self): + paddle.enable_static() self.op_type = "swish" self.init_dtype() @@ -1915,6 +1967,7 @@ class TestSwish(TestActivation): class TestSwishOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.swish, 1) diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py index 2c9168df472f493a16c19ad1b121ec0d126b6306..9dd617f90b65d4a1960ceaa30762cd8c20e9db09 100644 --- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py +++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py @@ -115,8 +115,15 @@ class InplaceTestBase(unittest.TestCase): fetch_val2, = exe.run(compiled_prog, feed=feed_dict, fetch_list=[fetch_var]) - - self.assertTrue(np.array_equal(fetch_val1, fetch_val2)) + # NOTE(zhiqiu): Temporarily changed from array_equal to allclose. + # The root cause is that fuse_all_reduce and fuse_all_optimizer_ops may + # result in small differences because of the instruction set on the virtual machine. + # And the related unit tests: test_fuse_all_reduce_pass and test_fuse_optimizer_pass use "almostEqual" in their checks.
+ # There are also some related issues: + # https://github.com/PaddlePaddle/Paddle/issues/21270 + # https://github.com/PaddlePaddle/Paddle/issues/21046 + # https://github.com/PaddlePaddle/Paddle/issues/21045 + self.assertTrue(np.allclose(fetch_val1, fetch_val2)) def check_multi_card_fetch_var(self): if self.is_invalid_test(): @@ -160,7 +167,8 @@ class InplaceTestBase(unittest.TestCase): fetch_vals.append(fetch_val) for item in fetch_vals: - self.assertTrue(np.array_equal(fetch_vals[0], item)) + # save above + self.assertTrue(np.allclose(fetch_vals[0], item)) class CUDAInplaceTest(InplaceTestBase): diff --git a/python/paddle/fluid/tests/unittests/test_communicator_async.py b/python/paddle/fluid/tests/unittests/test_communicator_async.py index d032d6d75b5b3a48ea1e752190952f4c52e23b07..a86b80b2cf98829a683045ae302f72a694809138 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_async.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_async.py @@ -30,11 +30,10 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distribu class TestCommunicator(unittest.TestCase): def net(self): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) + x = fluid.layers.data(name='x', shape=[1], dtype='float32') y = fluid.layers.data(name='y', shape=[1], dtype='float32') - cost = fluid.layers.square_error_cost(input=y_predict, label=y) + cost = fluid.layers.square_error_cost(input=x, label=y) avg_cost = fluid.layers.mean(cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py index d9fc9262b311f949a1a89cd079517c5c93d0d28d..5916000fba79fc0da2ef545beac634a3edfe01df 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py @@ -83,8 +83,8 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase): optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) - fleet.init_worker() exe.run(fluid.default_startup_program()) + fleet.init_worker() train_reader = paddle.batch(self.fake_reader(), batch_size=24) feeder = fluid.DataFeeder(place=place, feed_list=[x, z, y]) diff --git a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py index 391588780f342dc17ea821334e28f941f9ce359a..b0f55f2939dc94af603f4cc5851dbb5e6317774f 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py @@ -71,8 +71,8 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase): optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) - fleet.init_worker() exe.run(fleet.startup_program) + fleet.init_worker() train_reader = paddle.batch(self.fake_reader(), batch_size=24) feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py index c0044d9d620796057cce0e3a51b2dec2878a0e17..95b209b14602676a089a667b0a720056bbe1562b 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py @@ -27,11 +27,9 @@ import paddle.distributed.fleet as fleet class TestCommunicator(unittest.TestCase): def net(self): - x = 
fluid.layers.data(name='x', shape=[13], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) + x = fluid.layers.data(name='x', shape=[1], dtype='float32') y = fluid.layers.data(name='y', shape=[1], dtype='float32') - - cost = fluid.layers.square_error_cost(input=y_predict, label=y) + cost = fluid.layers.square_error_cost(input=x, label=y) avg_cost = fluid.layers.mean(cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_data.py b/python/paddle/fluid/tests/unittests/test_data.py index 8070148f8b36dd7dab7711abaf25994acebc7e6f..98739f6e1631e5ebd5fc8da45647118be8c05f6f 100644 --- a/python/paddle/fluid/tests/unittests/test_data.py +++ b/python/paddle/fluid/tests/unittests/test_data.py @@ -99,5 +99,17 @@ class TestApiStaticDataError(unittest.TestCase): self.assertRaises(TypeError, test_shape_type) + +class TestApiErrorWithDynamicMode(unittest.TestCase): + def test_error(self): + with program_guard(Program(), Program()): + paddle.disable_static() + self.assertRaises(AssertionError, fluid.data, 'a', [2, 25]) + self.assertRaises( + AssertionError, fluid.layers.data, 'b', shape=[2, 25]) + self.assertRaises( + AssertionError, paddle.static.data, 'c', shape=[2, 25]) + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py b/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py deleted file mode 100644 index b5d5d33fa3fc32a054c23c80d471ce70dd745d08..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -TestCases for Dataset, -including create, config, run, etc. -""" - -from __future__ import print_function -import numpy as np -import unittest -import os -import paddle -import zipfile -import paddle.dataset.common - -URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip" -MD5 = '155de2b77c6834dd8eea7cbe88e93acb' - - -class TestDatasetSentiment(unittest.TestCase): - """ TestCases for Sentiment. """ - - def test_get_word_dict(self): - """ Testcase for get_word_dict. """ - words_freq_sorted = paddle.dataset.sentiment.get_word_dict() - print(words_freq_sorted) - self.assertTrue(len(words_freq_sorted) == 39768) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py index 2a80e20d692c88497e7edccd6eca5509e3522871..97b6594eb382507ccbbb8b6bfad8e5631d534010 100755 --- a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py +++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py @@ -72,6 +72,7 @@ class TestDeprecatedDocorator(unittest.TestCase): test old fluid elementwise_mul api, it should fire the Warning function, which inserts the Warning info on top of the API's doc string.
""" + paddle.enable_static() # Initialization x = fluid.data(name='x', shape=[3, 2, 1], dtype='float32') @@ -80,6 +81,7 @@ class TestDeprecatedDocorator(unittest.TestCase): # captured captured = get_warning_index(fluid.data) + paddle.disable_static() # testting self.assertGreater(expected, captured) diff --git a/python/paddle/fluid/tests/unittests/test_diag.py b/python/paddle/fluid/tests/unittests/test_diag.py index 780d57b53310bb5f385a131d4ad52dd6f5e695f0..ddf1240e4ef27775a24cee540c5f193399112270 100644 --- a/python/paddle/fluid/tests/unittests/test_diag.py +++ b/python/paddle/fluid/tests/unittests/test_diag.py @@ -119,6 +119,16 @@ class TestDiagV2API(unittest.TestCase): (n, n)) + np.diag(self.input_np3, self.offset) - np.diag( self.padding_value * np.ones(n)) + self.input_np4 = np.random.random(size=(2000, 2000)).astype(np.float32) + self.expected6 = np.diag(self.input_np4) + self.expected7 = np.diag(self.input_np4, k=1) + self.expected8 = np.diag(self.input_np4, k=-1) + + self.input_np5 = np.random.random(size=(2000)).astype(np.float32) + self.expected9 = np.diag(self.input_np5) + self.expected10 = np.diag(self.input_np5, k=1) + self.expected11 = np.diag(self.input_np5, k=-1) + def run_imperative(self): x = paddle.to_tensor(self.input_np) y = paddle.diag(x) @@ -141,10 +151,32 @@ class TestDiagV2API(unittest.TestCase): y = paddle.diag(x, padding_value=-8) self.assertTrue(np.allclose(y.numpy(), self.expected5)) + x = paddle.to_tensor(self.input_np4) + y = paddle.diag(x) + self.assertTrue(np.allclose(y.numpy(), self.expected6)) + + y = paddle.diag(x, offset=1) + self.assertTrue(np.allclose(y.numpy(), self.expected7)) + + y = paddle.diag(x, offset=-1) + self.assertTrue(np.allclose(y.numpy(), self.expected8)) + + x = paddle.to_tensor(self.input_np5) + y = paddle.diag(x) + self.assertTrue(np.allclose(y.numpy(), self.expected9)) + + y = paddle.diag(x, offset=1) + self.assertTrue(np.allclose(y.numpy(), self.expected10)) + + y = paddle.diag(x, offset=-1) + self.assertTrue(np.allclose(y.numpy(), self.expected11)) + def run_static(self, use_gpu=False): x = paddle.data(name='input', shape=[10, 10], dtype='float32') x2 = paddle.data(name='input2', shape=[100], dtype='float64') x3 = paddle.data(name='input3', shape=[100], dtype='int64') + x4 = paddle.data(name='input4', shape=[2000, 2000], dtype='float32') + x5 = paddle.data(name='input5', shape=[2000], dtype='float32') result0 = paddle.diag(x) result1 = paddle.diag(x, offset=1) result2 = paddle.diag(x, offset=-1) @@ -152,17 +184,28 @@ class TestDiagV2API(unittest.TestCase): result4 = paddle.diag(x2, padding_value=8) result5 = paddle.diag(x3, padding_value=8.0) result6 = paddle.diag(x3, padding_value=-8) + result7 = paddle.diag(x4) + result8 = paddle.diag(x4, offset=1) + result9 = paddle.diag(x4, offset=-1) + result10 = paddle.diag(x5) + result11 = paddle.diag(x5, offset=1) + result12 = paddle.diag(x5, offset=-1) place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - res0, res1, res2, res4, res5, res6 = exe.run( + res0, res1, res2, res4, res5, res6, res7, res8, res9, res10, res11, res12 = exe.run( feed={ "input": self.input_np, "input2": self.input_np2, - 'input3': self.input_np3 + 'input3': self.input_np3, + 'input4': self.input_np4, + 'input5': self.input_np5 }, - fetch_list=[result0, result1, result2, result4, result5, result6]) + fetch_list=[ + result0, result1, result2, result4, result5, result6, result7, + result8, result9, result10, result11, result12 + ]) 
self.assertTrue(np.allclose(res0, self.expected0)) self.assertTrue(np.allclose(res1, self.expected1)) @@ -171,6 +214,12 @@ class TestDiagV2API(unittest.TestCase): self.assertTrue(np.allclose(res4, self.expected3)) self.assertTrue(np.allclose(res5, self.expected4)) self.assertTrue(np.allclose(res6, self.expected5)) + self.assertTrue(np.allclose(res7, self.expected6)) + self.assertTrue(np.allclose(res8, self.expected7)) + self.assertTrue(np.allclose(res9, self.expected8)) + self.assertTrue(np.allclose(res10, self.expected9)) + self.assertTrue(np.allclose(res11, self.expected10)) + self.assertTrue(np.allclose(res12, self.expected11)) def test_cpu(self): paddle.disable_static(place=paddle.fluid.CPUPlace()) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py index a82612b0ed2a6700dd157ddd6263cae2a879c274..7f55e956a94aee79dda07762e953e71807899bff 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py @@ -44,16 +44,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): paddle.fluid.framework.switch_startup_program(startup_program) fleet.init(role_maker.PaddleCloudRoleMaker()) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32') + y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = paddle.fluid.layers.square_error_cost(input=x, label=y) + avg_cost = paddle.fluid.layers.mean(cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True @@ -71,7 +66,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): sends += 1 if op.type == "sgd": sgds += 1 - self.assertEqual(sends, 7) + self.assertEqual(sends, 1) self.assertEqual(sgds, 0) fleet.init_worker() @@ -89,16 +84,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): paddle.fluid.framework.switch_startup_program(startup_program) fleet.init(role_maker.PaddleCloudRoleMaker()) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + + x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32') + y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = paddle.fluid.layers.square_error_cost(input=x, label=y) + avg_cost = paddle.fluid.layers.mean(cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py 
index b05a53c88bb9154b69640df6c39305a00e3c447b..db3f2afb3668bc1831286f8d13b274895e7632fd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py @@ -36,16 +36,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): def test_gradient_merge_optimizer(self): fleet.init(role_maker.PaddleCloudRoleMaker()) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32') + y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = paddle.fluid.layers.square_error_cost(input=x, label=y) + avg_cost = paddle.fluid.layers.mean(cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False @@ -63,7 +58,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): sends += 1 if op.type == "sgd": sgds += 1 - self.assertEqual(sends, 6) + self.assertEqual(sends, 0) self.assertEqual(sgds, 0) fleet.init_worker() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py index 3a923dbf3f72e28c64c3f01d22d4d6f2d897f79b..c46d1dc5b0f87262aee8efd4722418be433c98ea 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py @@ -34,8 +34,7 @@ import unittest import paddle import paddle.fluid as fluid import paddle.distributed.fleet.base.role_maker as role_maker -from paddle.distributed.fleet.base.util_factory import fleet_util -from paddle.distributed.fleet import fleet +import paddle.distributed.fleet as fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory __all__ = ['FleetDistRunnerBase', 'TestFleetBase', 'runtime_main'] @@ -97,7 +96,7 @@ class FleetDistRunnerBase(object): self.dump_fields_path = os.getenv("dump_fields_path", "") debug = int(os.getenv("Debug", "0")) # TODO(update strategy to support dump params) - if False: #debug: + if False: # debug: self.strategy.set_debug_opt({ "dump_param": self.dump_param, "dump_fields": self.dump_fields, @@ -372,8 +371,6 @@ def runtime_main(test_class): strategy = model.build_strategy(args) avg_cost = model.net(args) model.build_optimizer(avg_cost, strategy) - fleet_util._set_strategy(strategy) - fleet_util._set_role_maker(role) if args.role == "pserver": model.run_pserver(args) else: diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py index 4d744c8299f484fd60a081adb1b3b9eb2834ddef..ba97c5079bde429b0b7145208926b570d04725bc 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py @@ -34,8 +34,7 @@ import unittest import paddle import paddle.fluid as fluid import paddle.distributed.fleet.base.role_maker as role_maker -from paddle.distributed.fleet.base.util_factory import fleet_util -from paddle.distributed.fleet 
import fleet +import paddle.distributed.fleet as fleet __all__ = ['FleetDistHeterRunnerBase', 'TestFleetHeterBase', 'runtime_main'] @@ -376,8 +375,6 @@ def runtime_main(test_class): strategy = model.build_strategy(args) avg_cost = model.net(args) model.build_optimizer(avg_cost, strategy) - fleet_util._set_strategy(strategy) - fleet_util._set_role_maker(role) if args.role == "pserver" or args.role == "heter_trainer": model.run_pserver(args) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py index 00301f9b1c61dd12dc993e0b4c735479fe16daed..7f4e5d99e02084f363b71dbb73b80e59d704aa15 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py @@ -19,7 +19,6 @@ import os import math import paddle.fluid as fluid import paddle.distributed.fleet.base.role_maker as role_maker -from paddle.distributed.fleet.base.util_factory import fleet_util from paddle.distributed.fleet import fleet import paddle diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py index 379bcaf684d53c2c72f6369e72418cdaaaf3ac84..6fe52ba9fe61ad83341ece5c29fcafa89095de82 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py @@ -70,15 +70,13 @@ class TestPSPassWithBow(unittest.TestCase): q = fluid.layers.data( name="query_ids", shape=[1], dtype="int64", lod_level=1) # embedding - q_emb = fluid.layers.embedding( + q_emb = fluid.contrib.layers.sparse_embedding( input=q, - is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), - is_sparse=is_sparse) + learning_rate=emb_lr)) q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') @@ -97,15 +95,13 @@ class TestPSPassWithBow(unittest.TestCase): pt = fluid.layers.data( name="pos_title_ids", shape=[1], dtype="int64", lod_level=1) # embedding - pt_emb = fluid.layers.embedding( + pt_emb = fluid.contrib.layers.sparse_embedding( input=pt, - is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), - is_sparse=is_sparse) + learning_rate=emb_lr)) pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') @@ -123,15 +119,13 @@ class TestPSPassWithBow(unittest.TestCase): nt = fluid.layers.data( name="neg_title_ids", shape=[1], dtype="int64", lod_level=1) # embedding - nt_emb = fluid.layers.embedding( + nt_emb = fluid.contrib.layers.sparse_embedding( input=nt, - is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), - is_sparse=is_sparse) + learning_rate=emb_lr)) nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') @@ -167,7 +161,7 @@ class TestPSPassWithBow(unittest.TestCase): fleet.init(role) loss, acc, _ = self.net() - optimizer = fluid.optimizer.SGD(base_lr) + optimizer = fluid.optimizer.Adam(base_lr) strategy = StrategyFactory.create_async_strategy() optimizer = 
fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py index fd069793473648a0dff731d66c85bd3fe61997c7..c570c4d8cd01dd7e7b113b1f5f35c9887f4a4376 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py @@ -168,12 +168,13 @@ class TestPSPassWithBow(unittest.TestCase): fleet.init(role) loss, acc, _ = self.net() - optimizer = fluid.optimizer.SGD( + optimizer = fluid.optimizer.Adagrad( learning_rate=fluid.layers.exponential_decay( learning_rate=base_lr, decay_steps=500, decay_rate=0.969, staircase=True)) + strategy = StrategyFactory.create_async_strategy() optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py new file mode 100644 index 0000000000000000000000000000000000000000..c09f22f3fc5807ac3a496ff3fe17369963749ca5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py @@ -0,0 +1,171 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
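The change running through test_dist_fleet_ps4/ps5 above, and the new test_dist_fleet_ps6 below, is the move from a flag-configured fluid.layers.embedding to fluid.contrib.layers.sparse_embedding (together with a non-SGD optimizer). A minimal before/after sketch of that embedding swap, assuming a static-graph program and illustrative sizes rather than values from the patch, is:

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    ids = fluid.layers.data(name="ids", shape=[1], dtype="int64", lod_level=1)

    # old: dense table, sparse/distributed behaviour selected via flags
    # emb = fluid.layers.embedding(
    #     input=ids, size=[1500, 128], is_sparse=True, is_distributed=False,
    #     param_attr=fluid.ParamAttr(name="__emb__"))

    # new: large-scale sparse table; no is_sparse/is_distributed arguments
    emb = fluid.contrib.layers.sparse_embedding(
        input=ids,
        size=[1500, 128],
        param_attr=fluid.ParamAttr(name="__emb__"))
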
+ +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.incubate.fleet.base.role_maker as role_maker +from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet +from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory + +paddle.enable_static() + +# For Net +base_lr = 0.2 +emb_lr = base_lr * 3 +dict_dim = 1500 +emb_dim = 128 +hid_dim = 128 +margin = 0.1 +sample_rate = 1 +batch_size = 4 + + +class TestPSPassWithBow(unittest.TestCase): + def net(self): + def get_acc(cos_q_nt, cos_q_pt, batch_size): + cond = fluid.layers.less_than(cos_q_nt, cos_q_pt) + cond = fluid.layers.cast(cond, dtype='float64') + cond_3 = fluid.layers.reduce_sum(cond) + acc = fluid.layers.elementwise_div( + cond_3, + fluid.layers.fill_constant( + shape=[1], value=batch_size * 1.0, dtype='float64'), + name="simnet_acc") + return acc + + def get_loss(cos_q_pt, cos_q_nt): + loss_op1 = fluid.layers.elementwise_sub( + fluid.layers.fill_constant_batch_size_like( + input=cos_q_pt, + shape=[-1, 1], + value=margin, + dtype='float32'), + cos_q_pt) + loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) + loss_op3 = fluid.layers.elementwise_max( + fluid.layers.fill_constant_batch_size_like( + input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'), + loss_op2) + avg_cost = fluid.layers.mean(loss_op3) + return avg_cost + + is_distributed = False + is_sparse = True + + # query + q = fluid.layers.data( + name="query_ids", shape=[1], dtype="int64", lod_level=1) + # embedding + q_emb = fluid.contrib.layers.sparse_embedding( + input=q, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr)) + q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + # vsum + q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') + q_ss = fluid.layers.softsign(q_sum) + # fc layer after conv + q_fc = fluid.layers.fc( + input=q_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__q_fc__", + learning_rate=base_lr)) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + # pt + pt = fluid.layers.data( + name="pos_title_ids", shape=[1], dtype="int64", lod_level=1) + # embedding + pt_emb = fluid.contrib.layers.sparse_embedding( + input=pt, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr)) + pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + # vsum + pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') + pt_ss = fluid.layers.softsign(pt_sum) + # fc layer + pt_fc = fluid.layers.fc( + input=pt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + # nt + nt = fluid.layers.data( + name="neg_title_ids", shape=[1], dtype="int64", lod_level=1) + # embedding + nt_emb = fluid.contrib.layers.sparse_embedding( + input=nt, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr)) + nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + # vsum + nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') + nt_ss = fluid.layers.softsign(nt_sum) + # fc layer + 
nt_fc = fluid.layers.fc( + input=nt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc) + cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc) + # loss + avg_cost = get_loss(cos_q_pt, cos_q_nt) + # acc + acc = get_acc(cos_q_nt, cos_q_pt, batch_size) + return [avg_cost, acc, cos_q_pt] + + def test(self): + endpoints = [ + "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006", + "127.0.0.1:36007" + ] + + role = role_maker.UserDefinedRoleMaker( + current_id=0, + role=role_maker.Role.SERVER, + worker_num=2, + server_endpoints=endpoints) + + fleet.init(role) + loss, acc, _ = self.net() + optimizer = fluid.optimizer.Adagrad(base_lr) + strategy = StrategyFactory.create_async_strategy() + optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer.minimize(loss) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ee099e48eff60252c001b0baaf98e76eaef4b300 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py @@ -0,0 +1,174 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
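The new test below builds its init and training programs by appending raw framework operators (lookup_sparse_table_init/read and the fused optimizer ops) directly onto a block. As a self-contained sketch of that append_op/Executor mechanism only, using a plain fill_constant op rather than the sparse-table ops themselves, one could write:

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    prog = fluid.Program()
    block = prog.global_block()

    out = block.create_var(
        name="out",
        type=fluid.core.VarDesc.VarType.LOD_TENSOR,
        shape=[2, 3],
        dtype="float32")

    # append_op takes the op type plus dicts of input/output vars and attrs
    block.append_op(
        type="fill_constant",
        inputs={},
        outputs={"Out": out},
        attrs={
            "shape": [2, 3],
            "dtype": fluid.core.VarDesc.VarType.FP32,
            "value": 1.0,
        })

    exe = fluid.Executor(fluid.CPUPlace())
    result, = exe.run(prog, fetch_list=[out])
    assert np.allclose(result, np.ones((2, 3), dtype="float32"))
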
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.core as core + +import paddle +paddle.enable_static() + + +class TestLookupTableFuseOp(unittest.TestCase): + def test_fuse(self): + places = [core.CPUPlace()] + # currently only support CPU + for place in places: + self.check_with_place(place) + + def check_with_place(self, place): + scope = fluid.global_scope() + scope.var("LearningRate").get_tensor().set([0.01], place) + scope.var("Ids").get_tensor().set([i for i in range(100)], place) + + init_program = fluid.Program() + + lr = init_program.global_block().create_var( + name="LearningRate", + persistable=True, + type=fluid.core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + dtype="float32") + + ids = init_program.global_block().create_var( + name="Ids", + persistable=True, + type=fluid.core.VarDesc.VarType.LOD_TENSOR, + shape=[100], + dtype="int64") + + output = init_program.global_block().create_var( + name="output", + type=fluid.core.VarDesc.VarType.LOD_TENSOR, + shape=[100, 8], + dtype="float32") + + metas = [] + metas.append( + "embedding_1.block0:Param,Moment1,Moment2:8,8,8:0:embedding_1@GRAD.block0:embedding_1.block0,embedding_1_moment1_0,embedding_1_moment2_0,kSparseIDs@embedding_1.block0:uniform_random&0&-0.5&0.5,fill_constant&0.0,fill_constant&0.0:none" + ) + metas.append( + "embedding_2.block0:Param:8:0:embedding_2@GRAD.block0:embedding_2.block0,kSparseIDs@embedding_2.block0:uniform_random&0&-0.5&0.5:none" + ) + + init_program.global_block().append_op( + type="lookup_sparse_table_init", + inputs=None, + outputs=None, + attrs={"large_scale_metas": metas}) + + init_program.global_block().append_op( + type="lookup_sparse_table_read", + inputs={"Ids": ids}, + outputs={"Out": output}, + attrs={ + "tablename": "embedding_1.block0", + "init": True, + "value_names": ["Param"], + }) + + init_program.global_block().append_op( + type="lookup_sparse_table_read", + inputs={"Ids": ids}, + outputs={"Out": output}, + attrs={ + "tablename": "embedding_2.block0", + "init": True, + "value_names": ["Param"], + }) + + executor = fluid.Executor(place) + executor.run(init_program) + + training_program = fluid.Program() + + scope.var('Beta1Pow').get_tensor().set( + np.array([0]).astype("float32"), place) + scope.var('Beta2Pow').get_tensor().set( + np.array([0]).astype("float32"), place) + + rows = [0, 1, 2, 3, 4, 5, 6] + row_numel = 8 + w_selected_rows = scope.var('Grad').get_selected_rows() + w_selected_rows.set_height(len(rows)) + w_selected_rows.set_rows(rows) + w_array = np.ones((len(rows), row_numel)).astype("float32") + for i in range(len(rows)): + w_array[i] *= i + w_tensor = w_selected_rows.get_tensor() + w_tensor.set(w_array, place) + + lr = training_program.global_block().create_var( + name="LearningRate", + persistable=True, + type=fluid.core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + dtype="float32") + + grads = training_program.global_block().create_var( + name="Grad", + persistable=True, + type=fluid.core.VarDesc.VarType.SELECTED_ROWS, + shape=[100, 8], + dtype="float32") + + beta1 = training_program.global_block().create_var( + name="Beta1Pow", + persistable=True, + type=fluid.core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + dtype="float32") + + beta2 = training_program.global_block().create_var( + name="Beta2Pow", + persistable=True, + type=fluid.core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + dtype="float32") + + training_program.global_block().append_op( + type="lookup_sparse_table_fuse_adam", + inputs={ + 
"Grad": grads, + "LearningRate": lr, + "Beta1Pow": beta1, + "Beta2Pow": beta2, + }, + outputs={"Beta1PowOut": beta1, + "Beta2PowOut": beta2}, + attrs={ + "is_entry": False, + "tablename": "embedding_1.block0", + "value_names": ["Param", "Moment1", "Moment2"], + }) + + training_program.global_block().append_op( + type="lookup_sparse_table_fuse_sgd", + inputs={"Grad": grads, + "LearningRate": lr}, + attrs={ + "is_entry": False, + "tablename": "embedding_2.block0", + "value_names": ["Param"], + }) + + executor.run(training_program) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py new file mode 100644 index 0000000000000000000000000000000000000000..d74d08681c18c0f0f739b4a6f59a4773f0cd38da --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py @@ -0,0 +1,33 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + + +class TestDistMnist2x2FP16AllReduce(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._nccl2_mode = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_mnist_fp16_allreduce.py", delta=1e-5) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py index 45597e7253c4d5bab50aa58f5f58e13e89ce1c1e..ccd57c4d51529f2c178f6b7c6c8f98851b1e9e80 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -107,7 +107,7 @@ class TestFleetBase(unittest.TestCase): def test_util(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) - self.assertEqual(fleet.util(), None) + self.assertNotEqual(fleet.util, None) def test_barrier_worker(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index b20f33e11b656f1296510df653309a3569d45043..deaf342da12af9ab7d7b0c659961b3ee6c1ee478 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -102,6 +102,16 @@ class TestStrategyConfig(unittest.TestCase): strategy.dgc = "True" self.assertEqual(strategy.dgc, False) + def test_fp16_allreduce(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.fp16_allreduce = True + self.assertEqual(strategy.fp16_allreduce, True) + strategy.fp16_allreduce = False + self.assertEqual(strategy.fp16_allreduce, False) + with 
self.assertRaises(TypeError): + strategy.fp16_allreduce = "True" + self.assertEqual(strategy.fp16_allreduce, False) + def test_sync_nccl_allreduce(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.sync_nccl_allreduce = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..efffa9fa88fde709dc90bfa188250fbb7116d4f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py @@ -0,0 +1,91 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +import unittest +import paddle +import paddle.fluid as fluid +import os + +paddle.enable_static() + + +class TestFleetFP16CompressOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + + def net(self, main_prog, startup_prog, dtype='float32'): + with fluid.program_guard(main_prog, startup_prog): + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype=dtype) + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.fp16_allreduce = True + return avg_cost, strategy + + def test_fp16_allreduce_optimizer(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops = [op.type for op in avg_cost.block.ops] + cast_out = [ + op.output('Out')[0] for op in avg_cost.block.ops + if op.type == 'cast' + ] + + cast_op_count = 0 + for name in ops: + if name == 'cast': + cast_op_count += 1 + self.assertIn('cast', ops) + self.assertEqual(cast_op_count, 12) # 6 + 6, cast_fp16 + cast_fp32 + + for name in cast_out: + self.assertIn('cast_fp16', name) + + def test_fp16_allreduce_not_apply_fp16_net(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog, dtype='float16') + + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = 
fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops = [op.type for op in avg_cost.block.ops] + self.assertNotIn('cast', ops) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py index 4dd254af251ae955878f9846e0f0e06f65c3ec90..992fbbbe2661d7059fc2acccb2b7fda5a357157a 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py @@ -436,12 +436,12 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase): optimizer.minimize(avg_cost) comm_world = "server" - fleet.util().barrier(comm_world) + fleet.util.barrier(comm_world) - gather = fleet.util().all_gather(1, comm_world) + gather = fleet.util.all_gather(1, comm_world) self.assertEqual(gather[0], 1) - all_reduce = fleet.util().all_reduce(1, "sum", comm_world) + all_reduce = fleet.util.all_reduce(1, "sum", comm_world) self.assertEqual(1, all_reduce) self.clean(tmp) @@ -752,12 +752,12 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase): optimizer.minimize(avg_cost) comm_world = "server" - fleet.util().barrier(comm_world) + fleet.util.barrier(comm_world) - gather = fleet.util().all_gather(1, comm_world) + gather = fleet.util.all_gather(1, comm_world) self.assertEqual(gather[0], 1) - all_reduce = fleet.util().all_reduce(1, "sum", comm_world) + all_reduce = fleet.util.all_reduce(1, "sum", comm_world) self.assertEqual(1, all_reduce) self.clean(tmp) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py index 1570912e7406f930212eead64305e1e35e1b8ac0..b5c22b192a16dec43c5e5243c34b970f489f5bb7 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_util.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py @@ -22,7 +22,6 @@ import tempfile import os import sys from paddle.dataset.common import download, DATA_HOME -from paddle.distributed.fleet.base.util_factory import fleet_util import paddle.distributed.fleet.base.role_maker as role_maker @@ -59,8 +58,7 @@ class TestFleetUtil(unittest.TestCase): import paddle.distributed.fleet.base.role_maker as role_maker role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) - default_util = fleet.util() - self.assertEqual(default_util, None) + self.assertNotEqual(fleet.util, None) def test_set_user_defined_util(self): import paddle.distributed.fleet as fleet @@ -76,17 +74,19 @@ class TestFleetUtil(unittest.TestCase): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) my_util = UserDefinedUtil() - fleet.set_util(my_util) - user_id = fleet.util().get_user_id() + fleet.util = my_util + user_id = fleet.util.get_user_id() self.assertEqual(user_id, 10) def test_fs(self): - from paddle.distributed.fleet.utils.fs import LocalFS + import paddle.distributed.fleet as fleet + from paddle.distributed.fleet.utils import LocalFS + fs = LocalFS() dirs, files = fs.ls_dir("test_tmp") dirs, files = fs.ls_dir("./") self.assertFalse(fs.need_upload_download()) - fleet_util._set_file_system(fs) + fleet.util._set_file_system(fs) def download_files(self): path = download(self.proto_data_url, self.module_name, @@ -98,7 +98,8 @@ class TestFleetUtil(unittest.TestCase): return unzip_folder def test_get_file_shard(self): - self.assertRaises(Exception, fleet_util.get_file_shard, "files") + import paddle.distributed.fleet as fleet + 
self.assertRaises(Exception, fleet.util.get_file_shard, "files") try: import netifaces except: @@ -112,18 +113,20 @@ class TestFleetUtil(unittest.TestCase): role=role_maker.Role.WORKER, worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"], server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]) - fleet_util._set_role_maker(role) - files = fleet_util.get_file_shard(["1", "2", "3"]) + fleet.init(role) + + files = fleet.util.get_file_shard(["1", "2", "3"]) self.assertTrue(len(files) == 2 and "1" in files and "2" in files) def test_program_type_trans(self): + import paddle.distributed.fleet as fleet data_dir = self.download_files() program_dir = os.path.join(data_dir, self.pruned_dir) text_program = "pruned_main_program.pbtxt" binary_program = "pruned_main_program.bin" - text_to_binary = fleet_util._program_type_trans(program_dir, + text_to_binary = fleet.util._program_type_trans(program_dir, text_program, True) - binary_to_text = fleet_util._program_type_trans(program_dir, + binary_to_text = fleet.util._program_type_trans(program_dir, binary_program, False) self.assertTrue( os.path.exists(os.path.join(program_dir, text_to_binary))) @@ -131,6 +134,7 @@ class TestFleetUtil(unittest.TestCase): os.path.exists(os.path.join(program_dir, binary_to_text))) def test_prams_check(self): + import paddle.distributed.fleet as fleet data_dir = self.download_files() class config: @@ -160,11 +164,11 @@ class TestFleetUtil(unittest.TestCase): # test saved var's shape conf.dump_program_filename = "pruned_main_program.save_var_shape_not_match" - self.assertRaises(Exception, fleet_util._params_check) + self.assertRaises(Exception, fleet.util._params_check) # test program.proto without feed_op and fetch_op conf.dump_program_filename = "pruned_main_program.no_feed_fetch" - results = fleet_util._params_check(conf) + results = fleet.util._params_check(conf) self.assertTrue(len(results) == 1) np.testing.assert_array_almost_equal( results[0], np.array( @@ -172,11 +176,11 @@ class TestFleetUtil(unittest.TestCase): # test feed_var's shape conf.dump_program_filename = "pruned_main_program.feed_var_shape_not_match" - self.assertRaises(Exception, fleet_util._params_check) + self.assertRaises(Exception, fleet.util._params_check) # test correct case with feed_vars_filelist conf.dump_program_filename = "pruned_main_program.pbtxt" - results = fleet_util._params_check(conf) + results = fleet.util._params_check(conf) self.assertTrue(len(results) == 1) np.testing.assert_array_almost_equal( results[0], np.array( @@ -186,13 +190,14 @@ class TestFleetUtil(unittest.TestCase): conf.feed_config.feeded_vars_filelist = None # test feed var with lod_level >= 2 conf.dump_program_filename = "pruned_main_program.feed_lod2" - self.assertRaises(Exception, fleet_util._params_check) + self.assertRaises(Exception, fleet.util._params_check) conf.dump_program_filename = "pruned_main_program.pbtxt" - results = fleet_util._params_check(conf) + results = fleet.util._params_check(conf) self.assertTrue(len(results) == 1) def test_proto_check(self): + import paddle.distributed.fleet as fleet data_dir = self.download_files() class config: @@ -210,7 +215,7 @@ class TestFleetUtil(unittest.TestCase): "pruned_main_program.save_var_shape_not_match")) conf.is_text_pruned_program = True conf.draw = False - res = fleet_util._proto_check(conf) + res = fleet.util._proto_check(conf) self.assertFalse(res) # test match @@ -222,10 +227,11 @@ class TestFleetUtil(unittest.TestCase): else: conf.draw = True conf.draw_out_name = "pruned_check" - res = 
fleet_util._proto_check(conf) + res = fleet.util._proto_check(conf) self.assertTrue(res) def test_visualize(self): + import paddle.distributed.fleet as fleet if sys.platform == 'win32' or sys.platform == 'sys.platform': pass else: @@ -234,10 +240,10 @@ class TestFleetUtil(unittest.TestCase): data_dir, os.path.join(self.train_dir, "join_main_program.pbtxt")) is_text = True - program = fleet_util._load_program(program_path, is_text) + program = fleet.util._load_program(program_path, is_text) output_dir = os.path.join(data_dir, self.train_dir) output_filename = "draw_prog" - fleet_util._visualize_graphviz(program, output_dir, output_filename) + fleet.util._visualize_graphviz(program, output_dir, output_filename) self.assertTrue( os.path.exists( os.path.join(output_dir, output_filename + ".dot"))) diff --git a/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py new file mode 100644 index 0000000000000000000000000000000000000000..1bc305cd1f4dcd3faaaf8ccbe813bdf08e966d6e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py @@ -0,0 +1,215 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "Paddle core is not compiled with CUDA") +class TestFusedBnAddActAPI(unittest.TestCase): + def setUp(self): + self.conv_param_attr1 = fluid.ParamAttr( + name='conv2d_1.weight', + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + self.conv_param_attr2 = fluid.ParamAttr( + name='conv2d_2.weight', + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + self.bn_param_attr1 = fluid.ParamAttr( + name='batch_norm_w_1', + initializer=fluid.initializer.Constant(value=1.0)) + self.bn_bias_attr1 = fluid.ParamAttr( + name='batch_norm_b_1', + initializer=fluid.initializer.Constant(value=0.0)) + self.bn_param_attr2 = fluid.ParamAttr( + name='batch_norm_w_2', + initializer=fluid.initializer.Constant(value=1.0)) + self.bn_bias_attr2 = fluid.ParamAttr( + name='batch_norm_b_2', + initializer=fluid.initializer.Constant(value=0.0)) + self.fc_param_attr = fluid.ParamAttr( + name='fc.weight', + initializer=fluid.initializer.Xavier(uniform=False)) + + def build_fused_program(self, + main_program, + startup_program, + use_cuda, + seed=1): + with fluid.program_guard(main_program, startup_program): + x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') + y = fluid.layers.data(name="y", shape=[1], dtype='int64') + conv1_1 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr1, + bias_attr=False, + data_format='NHWC') + conv1_2 = fluid.layers.conv2d( + input=x, + filter_size=3, + 
num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr2, + bias_attr=False, + data_format='NHWC') + bn = fluid.layers.batch_norm( + input=conv1_1, + param_attr=self.bn_param_attr1, + bias_attr=self.bn_bias_attr1, + act=None, + data_layout='NHWC') + fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act( + conv1_2, + bn, + param_attr=self.bn_param_attr2, + bias_attr=self.bn_bias_attr2) + prediction = fluid.layers.fc(input=fused_bn_add_act, + size=10, + act='softmax', + param_attr=self.fc_param_attr) + loss = fluid.layers.cross_entropy(input=prediction, label=y) + loss = fluid.layers.mean(loss) + sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = fluid.contrib.mixed_precision.decorate( + sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) + sgd.minimize(loss) + + return x, y, loss + + def build_origin_program(self, + main_program, + startup_program, + use_cuda, + seed=1): + with fluid.program_guard(main_program, startup_program): + x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') + y = fluid.layers.data(name="y", shape=[1], dtype='int64') + conv1_1 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr1, + bias_attr=False, + data_format='NHWC') + conv1_2 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr2, + bias_attr=False, + data_format='NHWC') + bn1 = fluid.layers.batch_norm( + input=conv1_1, + param_attr=self.bn_param_attr1, + bias_attr=self.bn_bias_attr1, + act=None, + data_layout='NHWC') + bn2 = fluid.layers.batch_norm( + input=conv1_2, + param_attr=self.bn_param_attr2, + bias_attr=self.bn_bias_attr2, + act=None, + data_layout='NHWC') + out = bn1 + bn2 + out = fluid.layers.relu(out) + prediction = fluid.layers.fc(input=out, + size=10, + act='softmax', + param_attr=self.fc_param_attr) + loss = fluid.layers.cross_entropy(input=prediction, label=y) + loss = fluid.layers.mean(loss) + sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = fluid.contrib.mixed_precision.decorate( + sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) + sgd.minimize(loss) + + return x, y, loss + + def check(self, place, use_cuda): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) + iters = 5 + batch_size = 16 + + # build_fused_program + main_program = fluid.Program() + startup_program = fluid.Program() + x, y, loss = self.build_fused_program(main_program, startup_program, + use_cuda) + feeder = fluid.DataFeeder(feed_list=[x, y], place=place) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + exe = fluid.Executor(place) + loss_vals_fused = [] + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe.run(startup_program) + for _ in range(iters): + data = next(train_reader()) + loss_v = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss]) + loss_vals_fused.append(loss_v[0][0]) + + # build_origin_program + main_program = fluid.Program() + startup_program = fluid.Program() + x, y, loss = self.build_origin_program(main_program, startup_program, + use_cuda) + feeder = fluid.DataFeeder(feed_list=[x, y], place=place) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + loss_vals = [] + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe.run(startup_program) + for _ in range(iters): + data = next(train_reader()) + loss_v = exe.run(main_program, + 
feed=feeder.feed(data), + fetch_list=[loss]) + loss_vals.append(loss_v[0][0]) + + # check loss + for i in range(iters): + self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5) + + def test_fuse_bn_add_act(self): + place = fluid.CUDAPlace(0) + self.check(place, use_cuda=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py index a46b9b0ca78bf37e1c421a08a6fa8c5353c6d45d..833eeb33641c90206b4f85a9bf49ac57f5c8c8b1 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -56,7 +56,10 @@ class TestDygraphGroupNormv2(unittest.TestCase): x = np.random.randn(*shape).astype("float32") y1 = compute_v1(x) y2 = compute_v2(x) - self.assertTrue(np.allclose(y1, y2)) + result = np.allclose(y1, y2) + if not result: + print("y1:", y1, "\ty2:", y2) + self.assertTrue(result) test_weight_bias_false() def test_static(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 22e19efcb58d19c41835565de2c8c01fe253702a..bee53fd10f5feef911ce8ed105c7792cbd664a15 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -292,7 +292,7 @@ class TestDygraphPtbRnn(unittest.TestCase): np_t = v.numpy() self.model_base[k] = np_t - paddle.save(self.state_dict, "./test_dy") + fluid.save_dygraph(self.state_dict, "./test_dy") def testLoadAndSetVarBase(self): seed = 90 @@ -373,7 +373,7 @@ class TestDygraphPtbRnn(unittest.TestCase): if isinstance(adam._learning_rate, LearningRateDecay): adam._learning_rate.step_num = 0 - para_state_dict, opti_state_dict = paddle.load("./test_dy") + para_state_dict, opti_state_dict = fluid.load_dygraph("./test_dy") adam.set_state_dict(opti_state_dict) opti_dict = adam.state_dict() @@ -898,31 +898,31 @@ class TestDygraphPtbRnn(unittest.TestCase): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() - paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy')) - para_state_dict, opti_state_dict = paddle.load( + para_state_dict, opti_state_dict = fluid.load_dygraph( os.path.join('saved_dy', 'emb_dy')) self.assertTrue(opti_state_dict == None) - para_state_dict, opti_state_dict = paddle.load( + para_state_dict, opti_state_dict = fluid.load_dygraph( os.path.join('saved_dy', 'emb_dy.pdparams')) - para_state_dict, opti_state_dict = paddle.load( + para_state_dict, opti_state_dict = fluid.load_dygraph( os.path.join('saved_dy', 'emb_dy.pdopt')) def test_load_compatible_with_keep_name_table(self): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() - paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy')) - para_state_dict, opti_state_dict = paddle.load( + para_state_dict, opti_state_dict = fluid.load_dygraph( os.path.join('saved_dy', 'emb_dy'), True) self.assertTrue(para_state_dict != None) self.assertTrue(opti_state_dict == None) - para_state_dict, opti_state_dict = paddle.load( + para_state_dict, opti_state_dict = fluid.load_dygraph( os.path.join('saved_dy', 'emb_dy'), keep_name_table=True) self.assertTrue(para_state_dict != None) 
self.assertTrue(opti_state_dict == None) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 3eb413a62664057c56567d5834b216110fac04fb..5b7998198efa83a77c383d95d080a6e8cb30067f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -285,7 +285,7 @@ class TestDygraphPtbRnn(unittest.TestCase): else: self.base_opti[k] = v - fluid.save_dygraph(self.opti_dict, "./test_dy_v2") + paddle.save(self.opti_dict, "./test_dy_v2.pdopt") self.state_dict = ptb_model.state_dict() @@ -294,7 +294,7 @@ class TestDygraphPtbRnn(unittest.TestCase): np_t = v.numpy() self.model_base[k] = np_t - paddle.save(self.state_dict, "./test_dy_v2") + paddle.save(self.state_dict, "./test_dy_v2.pdparams") def testLoadAndSetVarBase(self): self.setUp() @@ -374,7 +374,8 @@ class TestDygraphPtbRnn(unittest.TestCase): self.assertTrue(np.sum(np.abs(v.numpy())) == 0) - para_state_dict, opti_state_dict = paddle.load("./test_dy_v2") + para_state_dict = paddle.load("./test_dy_v2.pdparams") + opti_state_dict = paddle.load("./test_dy_v2.pdopt") adam.set_state_dict(opti_state_dict) opti_dict = adam.state_dict() @@ -905,26 +906,19 @@ class TestDygraphPtbRnn(unittest.TestCase): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() - paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams')) - para_state_dict, opti_state_dict = paddle.load( - os.path.join('saved_dy', 'emb_dy')) - - self.assertTrue(opti_state_dict == None) - - para_state_dict, opti_state_dict = paddle.load( + para_state_dict = paddle.load( os.path.join('saved_dy', 'emb_dy.pdparams')) - para_state_dict, opti_state_dict = paddle.load( - os.path.join('saved_dy', 'emb_dy.pdopt')) - def test_no_state_in_input_dict(self): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() - paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams')) - para_state_dict, _ = paddle.load(os.path.join('saved_dy', 'emb_dy')) + para_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy.pdparams')) para_state_dict.pop('weight') emb.set_state_dict(para_state_dict) @@ -933,9 +927,10 @@ class TestDygraphPtbRnn(unittest.TestCase): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() - paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams')) - para_state_dict, _ = paddle.load(os.path.join('saved_dy', 'emb_dy')) + para_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy.pdparams')) para_state_dict['weight'] = np.expand_dims( para_state_dict['weight'], axis=-1) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index a04e1e4e5aafeeb605348b30125c5d42b3171674..e47a70054be4137aba9f4148e4cdc224b3e60260 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -25,6 +25,9 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope +if 
fluid.is_compiled_with_cuda(): + fluid.set_flags({'FLAGS_cudnn_deterministic': True}) + batch_size = 8 train_parameters = { "input_size": [3, 224, 224], @@ -340,7 +343,9 @@ class TestImperativeResneXt(unittest.TestCase): label.stop_gradient = True out = se_resnext(img) - loss = fluid.layers.cross_entropy(input=out, label=label) + softmax_out = fluid.layers.softmax(out, use_cudnn=False) + loss = fluid.layers.cross_entropy( + input=softmax_out, label=label) avg_loss = fluid.layers.mean(x=loss) dy_out = avg_loss.numpy() @@ -386,7 +391,8 @@ class TestImperativeResneXt(unittest.TestCase): name='pixel', shape=[3, 224, 224], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') out = se_resnext(img) - loss = fluid.layers.cross_entropy(input=out, label=label) + softmax_out = fluid.layers.softmax(out, use_cudnn=False) + loss = fluid.layers.cross_entropy(input=softmax_out, label=label) avg_loss = fluid.layers.mean(x=loss) optimizer.minimize(avg_loss) @@ -443,7 +449,9 @@ class TestImperativeResneXt(unittest.TestCase): static_grad_value[static_grad_name_list[ i - grad_start_pos]] = out[i] - self.assertTrue(np.allclose(static_out, dy_out)) + self.assertTrue( + np.allclose(static_out, dy_out), + "\nstatic_out: {}\ndy_out: {}".format(static_out, dy_out)) self.assertEqual(len(dy_param_init_value), len(static_param_init_value)) @@ -455,16 +463,23 @@ class TestImperativeResneXt(unittest.TestCase): self.assertEqual(len(dy_grad_value), len(static_grad_value)) for key, value in six.iteritems(static_grad_value): - self.assertTrue(np.allclose(value, dy_grad_value[key])) + self.assertTrue( + np.allclose(value, dy_grad_value[key]), + "\nstatic_grad_value: {}\ndy_grad_value: {}".format( + value, dy_grad_value[key])) self.assertTrue(np.isfinite(value.all())) self.assertFalse(np.isnan(value.any())) self.assertEqual(len(dy_param_value), len(static_param_value)) for key, value in six.iteritems(static_param_value): - self.assertTrue(np.allclose(value, dy_param_value[key])) + self.assertTrue( + np.allclose(value, dy_param_value[key]), + "\nstatic_param_value: {}\ndy_param_value: {}".format( + value, dy_param_value[key])) self.assertTrue(np.isfinite(value.all())) self.assertFalse(np.isnan(value.any())) if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py index b47834ffab85e56b12c787ac57823ee02dd18df7..a0da4b0efee64e6a7aa26a213419764afa90bd22 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py @@ -49,7 +49,7 @@ class TestDygraphDataLoaderSingalHandler(unittest.TestCase): test_process.start() set_child_signal_handler(id(self), test_process.pid) - time.sleep(3) + time.sleep(5) except core.EnforceNotMet as ex: self.assertIn("FatalError", cpt.get_exception_message(ex)) exception = ex diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py index a1a9b3f444fa411f90e869f5265fa0933393ff56..fdc1e6b52aba1d5189a759d748ad198c080b4609 100644 --- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -124,52 +124,67 @@ class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase): self.params_filename = None 
orig_param_dict = self.train_and_save_model() - load_param_dict, _ = paddle.load(self.save_dirname) + load_param_dict, _ = fluid.load_dygraph(self.save_dirname) self.check_load_state_dict(orig_param_dict, load_param_dict) + new_load_param_dict = paddle.load(self.save_dirname) + self.check_load_state_dict(orig_param_dict, new_load_param_dict) + def test_load_with_model_filename(self): self.save_dirname = "static_mnist.load_state_dict.model_filename" self.model_filename = "static_mnist.model" self.params_filename = None orig_param_dict = self.train_and_save_model() - configs = paddle.SaveLoadConfig() - configs.separate_params = True - configs.model_filename = self.model_filename - load_param_dict, _ = paddle.load(self.save_dirname, configs) + config = paddle.SaveLoadConfig() + config.separate_params = True + config.model_filename = self.model_filename + load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config) self.check_load_state_dict(orig_param_dict, load_param_dict) + new_load_param_dict = paddle.load(self.save_dirname, config) + self.check_load_state_dict(orig_param_dict, new_load_param_dict) + def test_load_with_param_filename(self): self.save_dirname = "static_mnist.load_state_dict.param_filename" self.model_filename = None self.params_filename = "static_mnist.params" orig_param_dict = self.train_and_save_model() - configs = paddle.SaveLoadConfig() - configs.params_filename = self.params_filename - load_param_dict, _ = paddle.load(self.save_dirname, configs) + config = paddle.SaveLoadConfig() + config.params_filename = self.params_filename + load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config) self.check_load_state_dict(orig_param_dict, load_param_dict) + new_load_param_dict = paddle.load(self.save_dirname, config) + self.check_load_state_dict(orig_param_dict, new_load_param_dict) + def test_load_with_model_and_param_filename(self): self.save_dirname = "static_mnist.load_state_dict.model_and_param_filename" self.model_filename = "static_mnist.model" self.params_filename = "static_mnist.params" orig_param_dict = self.train_and_save_model() - configs = paddle.SaveLoadConfig() - configs.params_filename = self.params_filename - configs.model_filename = self.model_filename - load_param_dict, _ = paddle.load(self.save_dirname, configs) + config = paddle.SaveLoadConfig() + config.params_filename = self.params_filename + config.model_filename = self.model_filename + load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config) self.check_load_state_dict(orig_param_dict, load_param_dict) + new_load_param_dict = paddle.load(self.save_dirname, config) + self.check_load_state_dict(orig_param_dict, new_load_param_dict) + def test_load_state_dict_from_save_params(self): self.save_dirname = "static_mnist.load_state_dict.save_params" self.params_filename = None orig_param_dict = self.train_and_save_model(True) - load_param_dict, _ = paddle.load(self.save_dirname) + load_param_dict, _ = fluid.load_dygraph(self.save_dirname) self.check_load_state_dict(orig_param_dict, load_param_dict) + new_load_param_dict = paddle.load(self.save_dirname) + self.check_load_state_dict(orig_param_dict, new_load_param_dict) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py index a70862f40197c513a0cd04753553264708ee2a1c..5df04ddfc3d26492323153b8b26658db4325b7ec 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py +++ 
b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py @@ -19,6 +19,7 @@ import paddle import paddle.fluid as fluid import numpy as np import six +import inspect class TestMathOpPatchesVarBase(unittest.TestCase): @@ -302,21 +303,13 @@ class TestMathOpPatchesVarBase(unittest.TestCase): self.assertEqual(x.dim(), 2) self.assertEqual(x.ndimension(), 2) self.assertEqual(x.ndim, 2) - self.assertEqual(x.size(), [2, 3]) - self.assertTrue( - np.array_equal(x.sigmoid().numpy(), fluid.layers.sigmoid(x).numpy( - ))) - self.assertTrue( - np.array_equal(x.log_sigmoid().numpy(), - fluid.layers.logsigmoid(x).numpy())) + self.assertEqual(x.size, 6) + self.assertEqual(x.numel(), 6) self.assertTrue(np.array_equal(x.exp().numpy(), paddle.exp(x).numpy())) self.assertTrue( np.array_equal(x.tanh().numpy(), paddle.tanh(x).numpy())) self.assertTrue( np.array_equal(x.atan().numpy(), paddle.atan(x).numpy())) - self.assertTrue( - np.array_equal(x.tanh_shrink().numpy(), - fluid.layers.tanh_shrink(x).numpy())) self.assertTrue(np.array_equal(x.abs().numpy(), paddle.abs(x).numpy())) m = x.abs() self.assertTrue( @@ -344,12 +337,6 @@ class TestMathOpPatchesVarBase(unittest.TestCase): ))) self.assertTrue( np.array_equal(x.square().numpy(), paddle.square(x).numpy())) - self.assertTrue( - np.array_equal(x.softplus().numpy(), - fluid.layers.softplus(x).numpy())) - self.assertTrue( - np.array_equal(x.softsign().numpy(), - fluid.layers.softsign(x).numpy())) self.assertTrue( np.array_equal(x.rank().numpy(), paddle.rank(x).numpy())) self.assertTrue( @@ -422,6 +409,8 @@ class TestMathOpPatchesVarBase(unittest.TestCase): self.assertTrue(np.array_equal(x.reciprocal(), paddle.reciprocal(x))) # 2. Binary operation + self.assertTrue( + np.array_equal(x.divide(y).numpy(), paddle.divide(x, y).numpy())) self.assertTrue( np.array_equal( x.matmul(y, True, False).numpy(), @@ -501,6 +490,73 @@ class TestMathOpPatchesVarBase(unittest.TestCase): self.assertTrue( np.array_equal( x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy())) + a = paddle.to_tensor([[1, 2], [3, 4]]) + b = paddle.to_tensor([[4, 3], [2, 1]]) + self.assertTrue( + np.array_equal( + x.where(a, b).numpy(), paddle.where(x, a, b).numpy())) + + self.assertTrue(inspect.ismethod(a.dot)) + self.assertTrue(inspect.ismethod(a.elementwise_add)) + self.assertTrue(inspect.ismethod(a.elementwise_div)) + self.assertTrue(inspect.ismethod(a.elementwise_floordiv)) + self.assertTrue(inspect.ismethod(a.elementwise_mod)) + self.assertTrue(inspect.ismethod(a.elementwise_sub)) + self.assertTrue(inspect.ismethod(a.logsumexp)) + self.assertTrue(inspect.ismethod(a.multiplex)) + self.assertTrue(inspect.ismethod(a.prod)) + self.assertTrue(inspect.ismethod(a.reduce_max)) + self.assertTrue(inspect.ismethod(a.reduce_min)) + self.assertTrue(inspect.ismethod(a.reduce_prod)) + self.assertTrue(inspect.ismethod(a.reduce_sum)) + self.assertTrue(inspect.ismethod(a.scale)) + self.assertTrue(inspect.ismethod(a.stanh)) + self.assertTrue(inspect.ismethod(a.sums)) + self.assertTrue(inspect.ismethod(a.elementwise_sum)) + self.assertTrue(inspect.ismethod(a.max)) + self.assertTrue(inspect.ismethod(a.maximum)) + self.assertTrue(inspect.ismethod(a.min)) + self.assertTrue(inspect.ismethod(a.minimum)) + self.assertTrue(inspect.ismethod(a.floor_divide)) + self.assertTrue(inspect.ismethod(a.remainder)) + self.assertTrue(inspect.ismethod(a.floor_mod)) + self.assertTrue(inspect.ismethod(a.multiply)) + self.assertTrue(inspect.ismethod(a.logsumexp)) + self.assertTrue(inspect.ismethod(a.inverse)) + 
self.assertTrue(inspect.ismethod(a.log1p)) + self.assertTrue(inspect.ismethod(a.erf)) + self.assertTrue(inspect.ismethod(a.addcmul)) + self.assertTrue(inspect.ismethod(a.addmm)) + self.assertTrue(inspect.ismethod(a.clip)) + self.assertTrue(inspect.ismethod(a.trace)) + self.assertTrue(inspect.ismethod(a.kron)) + self.assertTrue(inspect.ismethod(a.isinf)) + self.assertTrue(inspect.ismethod(a.isnan)) + self.assertTrue(inspect.ismethod(a.concat)) + self.assertTrue(inspect.ismethod(a.broadcast_to)) + self.assertTrue(inspect.ismethod(a.scatter_nd_add)) + self.assertTrue(inspect.ismethod(a.scatter_nd)) + self.assertTrue(inspect.ismethod(a.shard_index)) + self.assertTrue(inspect.ismethod(a.chunk)) + self.assertTrue(inspect.ismethod(a.stack)) + self.assertTrue(inspect.ismethod(a.strided_slice)) + self.assertTrue(inspect.ismethod(a.unsqueeze)) + self.assertTrue(inspect.ismethod(a.unstack)) + self.assertTrue(inspect.ismethod(a.argmax)) + self.assertTrue(inspect.ismethod(a.argmin)) + self.assertTrue(inspect.ismethod(a.argsort)) + self.assertTrue(inspect.ismethod(a.has_inf)) + self.assertTrue(inspect.ismethod(a.has_nan)) + self.assertTrue(inspect.ismethod(a.masked_select)) + self.assertTrue(inspect.ismethod(a.topk)) + self.assertTrue(inspect.ismethod(a.index_select)) + self.assertTrue(inspect.ismethod(a.nonzero)) + self.assertTrue(inspect.ismethod(a.sort)) + self.assertTrue(inspect.ismethod(a.index_sample)) + self.assertTrue(inspect.ismethod(a.mean)) + self.assertTrue(inspect.ismethod(a.reduce_mean)) + self.assertTrue(inspect.ismethod(a.std)) + self.assertTrue(inspect.ismethod(a.numel)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 884139a23d51c95c79439b91d501dc935baeae36..640771df23b726bd0a8a36b168bc5428fd953c45 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -65,15 +65,21 @@ class TestMatMulV2Op(OpTest): self.y_shape = (100, ) self.trans_x = False self.trans_y = False + + def init_kernel_type(self): self.dtype = "float64" def setUp(self): + self.init_kernel_type() self.config() self.op_type = "matmul_v2" x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y result = reference_matmul(x, y, self.trans_x, self.trans_y) - + result = result.astype(self.dtype) self.inputs = { 'X': x, 'Y': y, @@ -98,7 +104,6 @@ class TestMatMuklOp2(TestMatMulV2Op): self.y_shape = (1, 3, 2, 100) self.trans_x = False self.trans_y = True - self.dtype = "float64" class TestMatMuklOp3(TestMatMulV2Op): @@ -111,7 +116,6 @@ class TestMatMuklOp3(TestMatMulV2Op): self.y_shape = (1, 1, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp4(TestMatMulV2Op): @@ -124,7 +128,6 @@ class TestMatMuklOp4(TestMatMulV2Op): self.y_shape = (1, 2, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp5(TestMatMulV2Op): @@ -133,11 +136,10 @@ class TestMatMuklOp5(TestMatMulV2Op): """ def config(self): - self.x_shape = (1, 1, 100, 2) + self.x_shape = (1, 1, 100, 1) self.y_shape = (100, ) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp6(TestMatMulV2Op): @@ -150,7 +152,6 @@ class TestMatMuklOp6(TestMatMulV2Op): self.y_shape = (100, ) self.trans_x = True self.trans_y = False - self.dtype = "float64" class 
TestMatMuklOp7(TestMatMulV2Op): @@ -163,7 +164,6 @@ class TestMatMuklOp7(TestMatMulV2Op): self.y_shape = (100, ) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp8(TestMatMulV2Op): @@ -176,7 +176,6 @@ class TestMatMuklOp8(TestMatMulV2Op): self.y_shape = (1, 1, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp9(TestMatMulV2Op): @@ -189,7 +188,6 @@ class TestMatMuklOp9(TestMatMulV2Op): self.y_shape = (2, 1, 2, 100) self.trans_x = False self.trans_y = True - self.dtype = "float64" class TestMatMuklOp10(TestMatMulV2Op): @@ -198,11 +196,10 @@ class TestMatMuklOp10(TestMatMulV2Op): """ def config(self): - self.x_shape = (1, 1, 2, 100) - self.y_shape = (1, 2, 100, 2) + self.x_shape = (1, 1, 25, 4) + self.y_shape = (1, 2, 4, 25) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp11(TestMatMulV2Op): @@ -215,7 +212,6 @@ class TestMatMuklOp11(TestMatMulV2Op): self.y_shape = (1, 1, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp12(TestMatMulV2Op): @@ -224,11 +220,10 @@ class TestMatMuklOp12(TestMatMulV2Op): """ def config(self): - self.x_shape = (2, 1, 100, 2) - self.y_shape = (1, 1, 100, 2) + self.x_shape = (2, 1, 4, 25) + self.y_shape = (1, 1, 4, 25) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp13(TestMatMulV2Op): @@ -237,11 +232,10 @@ class TestMatMuklOp13(TestMatMulV2Op): """ def config(self): - self.x_shape = (2, 2, 100, 2) - self.y_shape = (2, 2, 100, 2) + self.x_shape = (2, 2, 2, 50) + self.y_shape = (2, 2, 2, 50) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp14(TestMatMulV2Op): @@ -254,7 +248,6 @@ class TestMatMuklOp14(TestMatMulV2Op): self.y_shape = (1, 2, 2, 100, 2) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp15(TestMatMulV2Op): @@ -267,7 +260,6 @@ class TestMatMuklOp15(TestMatMulV2Op): self.y_shape = (1, 2, 2, 100, 1) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp16(TestMatMulV2Op): @@ -277,10 +269,9 @@ class TestMatMuklOp16(TestMatMulV2Op): def config(self): self.x_shape = (100) - self.y_shape = (1, 2, 2, 100, 1) + self.y_shape = (1, 2, 2, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp17(TestMatMulV2Op): @@ -293,7 +284,54 @@ class TestMatMuklOp17(TestMatMulV2Op): self.y_shape = (100) self.trans_x = False self.trans_y = False - self.dtype = "float64" + + +#--------------------test matmul fp16-------------------- + + +def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0): + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=atol) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulV2Op) +create_test_fp16_class(TestMatMuklOp2) +create_test_fp16_class(TestMatMuklOp3) 
+create_test_fp16_class(TestMatMuklOp4) +create_test_fp16_class(TestMatMuklOp5) +create_test_fp16_class(TestMatMuklOp6) +create_test_fp16_class(TestMatMuklOp7) +create_test_fp16_class(TestMatMuklOp8) +create_test_fp16_class(TestMatMuklOp9) +create_test_fp16_class(TestMatMuklOp10) +create_test_fp16_class(TestMatMuklOp11) +create_test_fp16_class(TestMatMuklOp12) +create_test_fp16_class(TestMatMuklOp13) +create_test_fp16_class(TestMatMuklOp14) +create_test_fp16_class(TestMatMuklOp15) +create_test_fp16_class(TestMatMuklOp16) +create_test_fp16_class(TestMatMuklOp17) class TestMatMulV2API(unittest.TestCase): @@ -331,6 +369,17 @@ class TestMatMulV2API(unittest.TestCase): y = paddle.to_tensor(input_y) result = paddle.matmul(x, y) + def test_dygraph_fp16(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float16") + input_y = np.random.random([3, 4]).astype("float16") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_minimum_op.py b/python/paddle/fluid/tests/unittests/test_minimum_op.py index 4c08b7386ca2c5da04c0a289872dacf68a2ea040..a0673c82c5b341e550485ebdcee4e4616693d641 100644 --- a/python/paddle/fluid/tests/unittests/test_minimum_op.py +++ b/python/paddle/fluid/tests/unittests/test_minimum_op.py @@ -61,8 +61,8 @@ class ApiMinimumTest(unittest.TestCase): def test_dynamic_api(self): paddle.disable_static() np_x = np.array([10, 10]).astype('float64') - x = paddle.to_variable(self.input_x) - y = paddle.to_variable(self.input_y) + x = paddle.to_tensor(self.input_x) + y = paddle.to_tensor(self.input_y) z = paddle.minimum(x, y) np_z = z.numpy() z_expected = np.array(np.minimum(self.input_x, self.input_y)) @@ -73,8 +73,8 @@ class ApiMinimumTest(unittest.TestCase): np_x = np.random.rand(5, 4, 3, 2).astype("float64") np_y = np.random.rand(4, 3).astype("float64") - x = paddle.to_variable(self.input_x) - y = paddle.to_variable(self.input_y) + x = paddle.to_tensor(self.input_x) + y = paddle.to_tensor(self.input_y) result_1 = paddle.minimum(x, y, axis=1) result_2 = paddle.minimum(x, y, axis=-2) self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True) diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py index 753d96c44114a552f4bdd299602d7f13f672efbf..e327307e955308e78f6e9640681c842060a34882 100644 --- a/python/paddle/fluid/tests/unittests/test_mse_loss.py +++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py @@ -205,8 +205,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase): paddle.disable_static() dy_ret = paddle.nn.functional.mse_loss( - paddle.to_variable(input_np), - paddle.to_variable(target_np), 'mean') + paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'mean') dy_result = dy_ret.numpy() sub = input_np - target_np @@ -240,8 +239,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase): paddle.disable_static() dy_ret = paddle.nn.functional.mse_loss( - paddle.to_variable(input_np), - paddle.to_variable(target_np), 'sum') + paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'sum') dy_result = dy_ret.numpy() sub = input_np - target_np @@ -275,8 +273,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase): paddle.disable_static() dy_ret = paddle.nn.functional.mse_loss( - paddle.to_variable(input_np), - paddle.to_variable(target_np), 
'none') + paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'none') dy_result = dy_ret.numpy() sub = input_np - target_np diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py index 0706eb53d537da58a5a248e060759b748b30af19..1bb720673e4f33ac7a866cdf73a885741ee08e7e 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py @@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase): dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) dataloader = DataLoader( dataset, - places=places, num_workers=num_workers, batch_size=BATCH_SIZE, drop_last=True) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py index 8f0209406fdff1d4f7659b15d5e6bd8af74fd0f3..af332d8e43209251c4d3751f2689266f0fcd6c1e 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py @@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase): dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) dataloader = DataLoader( dataset, - places=places, num_workers=num_workers, batch_size=BATCH_SIZE, drop_last=True) diff --git a/python/paddle/fluid/tests/unittests/test_nll_loss.py b/python/paddle/fluid/tests/unittests/test_nll_loss.py index e7154193beaf788a9d20f3c131b1df3420918266..c07bf949af39e38222b05394f65977c7027e2f13 100644 --- a/python/paddle/fluid/tests/unittests/test_nll_loss.py +++ b/python/paddle/fluid/tests/unittests/test_nll_loss.py @@ -909,8 +909,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase): with fluid.dygraph.guard(): x_np = np.random.random(size=(5, )).astype(np.float64) label_np = np.random.randint(0, 10, size=(5, )).astype(np.int64) - x = paddle.to_variable(x_np) - label = paddle.to_variable(label_np) + x = paddle.to_tensor(x_np) + label = paddle.to_tensor(label_np) nll_loss = paddle.nn.loss.NLLLoss() res = nll_loss(x, label) @@ -933,8 +933,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase): with fluid.dygraph.guard(): x_np = np.random.random(size=(5, 3)).astype(np.float64) label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64) - x = paddle.to_variable(x_np) - label = paddle.to_variable(label_np) + x = paddle.to_tensor(x_np) + label = paddle.to_tensor(label_np) nll_loss = paddle.nn.loss.NLLLoss(reduction='') res = nll_loss(x, label) @@ -957,8 +957,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase): with fluid.dygraph.guard(): x_np = np.random.random(size=(5, 3)).astype(np.float64) label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64) - x = paddle.to_variable(x_np) - label = paddle.to_variable(label_np) + x = paddle.to_tensor(x_np) + label = paddle.to_tensor(label_np) res = paddle.nn.functional.nll_loss(x, label, reduction='') self.assertRaises( diff --git a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py index 0ebe769fb9bce1aee8412ccebc216c2c85e97775..8ee3b2ac20320c3b82eb7bb81509a9a84ce959a7 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py +++ b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py @@ -101,9 +101,9 @@ def create_test_case(margin, reduction): 
def run_dynamic_functional_api(self, place): paddle.disable_static(place) - x = paddle.to_variable(self.x_data) - y = paddle.to_variable(self.y_data) - label = paddle.to_variable(self.label_data) + x = paddle.to_tensor(self.x_data) + y = paddle.to_tensor(self.y_data) + label = paddle.to_tensor(self.label_data) result = paddle.nn.functional.margin_ranking_loss(x, y, label, margin, reduction) @@ -117,9 +117,9 @@ def create_test_case(margin, reduction): def run_dynamic_api(self, place): paddle.disable_static(place) - x = paddle.to_variable(self.x_data) - y = paddle.to_variable(self.y_data) - label = paddle.to_variable(self.label_data) + x = paddle.to_tensor(self.x_data) + y = paddle.to_tensor(self.y_data) + label = paddle.to_tensor(self.label_data) margin_rank_loss = paddle.nn.loss.MarginRankingLoss( margin=margin, reduction=reduction) result = margin_rank_loss(x, y, label) @@ -134,9 +134,9 @@ def create_test_case(margin, reduction): def run_dynamic_broadcast_api(self, place): paddle.disable_static(place) label_data = np.random.choice([-1, 1], size=[10]).astype("float64") - x = paddle.to_variable(self.x_data) - y = paddle.to_variable(self.y_data) - label = paddle.to_variable(label_data) + x = paddle.to_tensor(self.x_data) + y = paddle.to_tensor(self.y_data) + label = paddle.to_tensor(label_data) margin_rank_loss = paddle.nn.loss.MarginRankingLoss( margin=margin, reduction=reduction) result = margin_rank_loss(x, y, label) diff --git a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py index d52a1f5d5b16ca7e0d58230a1a17624e5bff0b02..90132a0923df716e9e2a0224671006cb62c1bba0 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py @@ -56,7 +56,7 @@ class TestNNSigmoidAPI(unittest.TestCase): def check_dynamic_api(self, place): paddle.disable_static(place) - x = paddle.to_variable(self.x) + x = paddle.to_tensor(self.x) mysigmoid = nn.Sigmoid() y = mysigmoid(x) self.assertTrue(np.allclose(y.numpy(), self.y)) @@ -94,7 +94,7 @@ class TestNNFunctionalSigmoidAPI(unittest.TestCase): def check_dynamic_api(self): paddle.disable_static() - x = paddle.to_variable(self.x) + x = paddle.to_tensor(self.x) y = functional.sigmoid(x) self.assertTrue(np.allclose(y.numpy(), self.y)) diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index a89b9fde7f92de0d493ad87a2f0950548ba8ff98..cb4bd16ce219f8a649716d8efff07eb82d5fffc4 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -130,5 +130,41 @@ class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck): self.shape = [2, 2, 3, 4, 5] +class TestBatchNormDoubleGradCheckCase5(TestBatchNormDoubleGradCheck): + @prog_scope() + def func(self, place): + prog = fluid.Program() + with fluid.program_guard(prog): + np.random.seed() + dtype = "float32" + eps = 0.005 + atol = 2e-4 + chn = self.shape[1] if self.data_layout == 'NCHW' else self.shape[ + -1] + x = layers.create_parameter(dtype=dtype, shape=self.shape, name='x') + z = fluid.layers.batch_norm( + input=x, + data_layout=self.data_layout, + use_global_stats=self.use_global_stats) + x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype) + w, b = prog.global_block().all_parameters()[1:3] + w_arr = np.ones(chn).astype(dtype) + b_arr = np.zeros(chn).astype(dtype) + gradient_checker.double_grad_check( + [x, w, b], + z, 
+ x_init=[x_arr, w_arr, b_arr], + atol=atol, + place=place, + eps=eps) + + +class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5): + def init_test(self): + self.data_layout = 'NCHW' + self.use_global_stats = True + self.shape = [2, 3, 4, 5] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_numel_op.py b/python/paddle/fluid/tests/unittests/test_numel_op.py index 8512bc99e7451c73e5513b834fb6aa448717c646..800706e5965dffedadb61c384d946c8ed28bf826 100644 --- a/python/paddle/fluid/tests/unittests/test_numel_op.py +++ b/python/paddle/fluid/tests/unittests/test_numel_op.py @@ -76,8 +76,8 @@ class TestNumelOoAPI(unittest.TestCase): paddle.disable_static(paddle.CPUPlace()) input_1 = np.random.random([2, 1, 4, 5]).astype("int32") input_2 = np.random.random([1, 4, 5]).astype("int32") - x_1 = paddle.to_variable(input_1) - x_2 = paddle.to_variable(input_2) + x_1 = paddle.to_tensor(input_1) + x_2 = paddle.to_tensor(input_2) out_1 = paddle.numel(x_1) out_2 = paddle.numel(x_2) assert (np.array_equal(out_1.numpy().item(0), np.size(input_1))) diff --git a/python/paddle/fluid/tests/unittests/test_ones_like.py b/python/paddle/fluid/tests/unittests/test_ones_like.py index c1e6a3377710f98184e9541e287b911def89cd81..bb0d6f07bdbde18d155b66c7d014503747ebd887 100644 --- a/python/paddle/fluid/tests/unittests/test_ones_like.py +++ b/python/paddle/fluid/tests/unittests/test_ones_like.py @@ -63,7 +63,7 @@ class TestOnesLikeImpeartive(unittest.TestCase): place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() paddle.disable_static(place) - x = paddle.to_variable(np.ones(shape)) + x = paddle.to_tensor(np.ones(shape)) for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]: out = ones_like(x, dtype) self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True) diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py new file mode 100644 index 0000000000000000000000000000000000000000..fee3494558604fb00f767261c06d4b3612e62ad0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -0,0 +1,145 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.optimizer as opt
+
+BATCH_SIZE = 16
+BATCH_NUM = 4
+EPOCH_NUM = 4
+SEED = 10
+
+IMAGE_SIZE = 784
+CLASS_NUM = 10
+
+
+def random_batch_reader():
+    def _get_random_inputs_and_labels():
+        np.random.seed(SEED)
+        image = np.random.random([BATCH_SIZE, IMAGE_SIZE]).astype('float32')
+        label = np.random.randint(0, CLASS_NUM - 1, (
+            BATCH_SIZE,
+            1, )).astype('int64')
+        return image, label
+
+    def __reader__():
+        for _ in range(BATCH_NUM):
+            batch_image, batch_label = _get_random_inputs_and_labels()
+            batch_image = paddle.to_tensor(batch_image)
+            batch_label = paddle.to_tensor(batch_label)
+            yield batch_image, batch_label
+
+    return __reader__
+
+
+class LinearNet(nn.Layer):
+    def __init__(self):
+        super(LinearNet, self).__init__()
+        self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
+
+    def forward(self, x):
+        return self._linear(x)
+
+
+def train(layer, loader, loss_fn, opt):
+    for epoch_id in range(EPOCH_NUM):
+        for batch_id, (image, label) in enumerate(loader()):
+            out = layer(image)
+            loss = loss_fn(out, label)
+            loss.backward()
+            opt.step()
+            opt.clear_grad()
+
+
+class TestSaveLoad(unittest.TestCase):
+    def setUp(self):
+        # enable dygraph mode
+        paddle.disable_static()
+
+        # config seed
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
+
+    def build_and_train_model(self):
+        # create network
+        layer = LinearNet()
+        loss_fn = nn.CrossEntropyLoss()
+
+        adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters())
+
+        # create data loader
+        # TODO: the new DataLoader causes an unknown timeout on Windows, replace it later
+        loader = random_batch_reader()
+
+        # train
+        train(layer, loader, loss_fn, adam)
+
+        return layer, adam
+
+    def check_load_state_dict(self, orig_dict, load_dict):
+        for var_name, value in orig_dict.items():
+            self.assertTrue(np.array_equal(value.numpy(), load_dict[var_name]))
+
+    def test_save_load(self):
+        layer, opt = self.build_and_train_model()
+
+        # save
+        layer_save_path = "test_paddle_save_load.linear.pdparams"
+        opt_save_path = "test_paddle_save_load.linear.pdopt"
+        layer_state_dict = layer.state_dict()
+        opt_state_dict = opt.state_dict()
+
+        paddle.save(layer_state_dict, layer_save_path)
+        paddle.save(opt_state_dict, opt_save_path)
+
+        # load
+        load_layer_state_dict = paddle.load(layer_save_path)
+        load_opt_state_dict = paddle.load(opt_save_path)
+
+        self.check_load_state_dict(layer_state_dict, load_layer_state_dict)
+        self.check_load_state_dict(opt_state_dict, load_opt_state_dict)
+
+        # test save load in static mode
+        paddle.enable_static()
+        static_save_path = "static_mode_test/test_paddle_save_load.linear.pdparams"
+        paddle.save(layer_state_dict, static_save_path)
+        load_static_state_dict = paddle.load(static_save_path)
+        self.check_load_state_dict(layer_state_dict, load_static_state_dict)
+
+        # error test cases, some tests rely on the base test above
+        # 1. test save obj not dict error
+        test_list = [1, 2, 3]
+        with self.assertRaises(NotImplementedError):
+            paddle.save(test_list, "not_dict_error_path")
+
+        # 2. test save path format error
+        with self.assertRaises(ValueError):
+            paddle.save(layer_state_dict, "test_paddle_save_load.linear.model/")
+
+        # 3. test load path not exist error
+        with self.assertRaises(ValueError):
+            paddle.load("test_paddle_save_load.linear.params")
+
+        # 4.
test load old save path error + with self.assertRaises(ValueError): + paddle.load("test_paddle_save_load.linear") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py index baf0efa6ec2e7edafb8d331423a7b47155283c21..cf138e67726163d3d1c990a180fa229b88fed99f 100644 --- a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py +++ b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py @@ -48,8 +48,8 @@ def test_static(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False): def test_dygraph(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False): paddle.disable_static() - x = paddle.to_variable(x_np) - y = paddle.to_variable(y_np) + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) dist = paddle.nn.layer.distance.PairwiseDistance( p=p, epsilon=epsilon, keepdim=keepdim) distance = dist(x, y) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index a2f1d77460819a933e8ab536d0ebc7dd32bedf3f..da7e30ff10643278eb355c65157a02a8fae6cff3 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -124,8 +124,10 @@ class TestMNIST(TestParallelExecutorBase): def test_simple_fc_with_new_strategy(self): # use_cuda, use_reduce - self._compare_reduce_and_allreduce(simple_fc_net, True) - self._compare_reduce_and_allreduce(simple_fc_net, False) + # NOTE: the computation result of nccl_reduce is non-deterministic, + # related issue: https://github.com/NVIDIA/nccl/issues/157 + self._compare_reduce_and_allreduce(simple_fc_net, True, 1e-5, 1e-2) + self._compare_reduce_and_allreduce(simple_fc_net, False, 1e-5, 1e-2) def check_simple_fc_parallel_accuracy(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): @@ -179,7 +181,7 @@ class TestMNIST(TestParallelExecutorBase): # NOTE: the computation result of nccl_reduce is non-deterministic, # related issue: https://github.com/NVIDIA/nccl/issues/157 self._compare_reduce_and_allreduce(fc_with_batchnorm, True, 1e-5, 1e-2) - self._compare_reduce_and_allreduce(fc_with_batchnorm, False) + self._compare_reduce_and_allreduce(fc_with_batchnorm, False, 1e-5, 1e-2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_sort_op.py b/python/paddle/fluid/tests/unittests/test_sort_op.py index 015b72fd1c5275f758a109451110f61b97c4a0c7..366e0c7a3fa3ee714162e6041aa0d52dbfb30746 100644 --- a/python/paddle/fluid/tests/unittests/test_sort_op.py +++ b/python/paddle/fluid/tests/unittests/test_sort_op.py @@ -72,14 +72,14 @@ class TestSortDygraph(unittest.TestCase): def test_api_0(self): paddle.disable_static(self.place) - var_x = paddle.to_variable(self.input_data) + var_x = paddle.to_tensor(self.input_data) out = paddle.sort(var_x) self.assertEqual((np.sort(self.input_data) == out.numpy()).all(), True) paddle.enable_static() def test_api_1(self): paddle.disable_static(self.place) - var_x = paddle.to_variable(self.input_data) + var_x = paddle.to_tensor(self.input_data) out = paddle.sort(var_x, axis=-1) self.assertEqual( (np.sort( diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py index 5aaf31993448ab0ff0c69f648cfa84c62d3e198b..b0f065a26a006ee3553a84938fb5b6b2db7b3172 100644 --- a/python/paddle/fluid/tests/unittests/test_tile_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_tile_op.py @@ -230,13 +230,13 @@ class TestTileAPI(unittest.TestCase): def test_api(self): with fluid.dygraph.guard(): np_x = np.random.random([12, 14]).astype("float32") - x = paddle.to_variable(np_x) + x = paddle.to_tensor(np_x) positive_2 = np.array([2]).astype("int32") - positive_2 = paddle.to_variable(positive_2) + positive_2 = paddle.to_tensor(positive_2) repeat_times = np.array([2, 3]).astype("int32") - repeat_times = paddle.to_variable(repeat_times) + repeat_times = paddle.to_tensor(repeat_times) out_1 = paddle.tile(x, repeat_times=[2, 3]) out_2 = paddle.tile(x, repeat_times=[positive_2, 3]) diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index bd76edc9d8cadf14c6cf224b7708ff4acd6efef4..7c7a71a3be1b508c850048c3945f29ef7424654c 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -234,23 +234,23 @@ class TestTransformer(unittest.TestCase): if cache_dict: if 'k' and 'v' in cache_dict: cache_obj = multi_head_attn.Cache( - paddle.to_variable(cache_dict['k']), - paddle.to_variable(cache_dict['v'])) + paddle.to_tensor(cache_dict['k']), + paddle.to_tensor(cache_dict['v'])) elif 'static_k' and 'static_v' in cache_dict: cache_obj = multi_head_attn.StaticCache( - paddle.to_variable(cache_dict['static_k']), - paddle.to_variable(cache_dict['static_v'])) + paddle.to_tensor(cache_dict['static_k']), + paddle.to_tensor(cache_dict['static_v'])) if attn_mask is not None: attn_output = multi_head_attn( - paddle.to_variable(query), - paddle.to_variable(key), - paddle.to_variable(value), - paddle.to_variable(attn_mask), cache_obj) + paddle.to_tensor(query), + paddle.to_tensor(key), + paddle.to_tensor(value), + paddle.to_tensor(attn_mask), cache_obj) else: attn_output = multi_head_attn( - paddle.to_variable(query), - paddle.to_variable(key), - paddle.to_variable(value), attn_mask, cache_obj) + paddle.to_tensor(query), + paddle.to_tensor(key), + paddle.to_tensor(value), attn_mask, cache_obj) attn_output = attn_output[0] if cache_dict else attn_output # implementation by numpy @@ -296,16 +296,16 @@ class TestTransformer(unittest.TestCase): attn_dropout, act_dropout) encoder_output = encoder_layer( - paddle.to_variable(src), - paddle.to_variable(src_mask)) # paddle.to_variable(src_mask)) + paddle.to_tensor(src), + paddle.to_tensor(src_mask)) # paddle.to_tensor(src_mask)) # 4.numpy: # paddle self attention self_attn = MultiHeadAttention( d_model, n_head, dropout=attn_dropout) attn_output = self_attn( - paddle.to_variable(src), - paddle.to_variable(src), - paddle.to_variable(src), paddle.to_variable(src_mask)).numpy() + paddle.to_tensor(src), + paddle.to_tensor(src), + paddle.to_tensor(src), paddle.to_tensor(src_mask)).numpy() src = attn_output + residual src_norm = layer_norm(src, d_model, encoder_layer.norm1) @@ -348,13 +348,13 @@ class TestTransformer(unittest.TestCase): cache_objs = None if cache: cache_objs = decoder_layer.gen_cache( - paddle.to_variable(memory)) + paddle.to_tensor(memory)) decoder_output = decoder_layer( - paddle.to_variable(tgt), - paddle.to_variable(memory), - paddle.to_variable(tgt_mask), - paddle.to_variable(memory_mask), cache_objs) + paddle.to_tensor(tgt), + paddle.to_tensor(memory), + paddle.to_tensor(tgt_mask), + paddle.to_tensor(memory_mask), cache_objs) decoder_output = decoder_output[0].numpy( ) if cache else decoder_output.numpy() @@ -365,10 +365,10 @@ class 
TestTransformer(unittest.TestCase): self_attn_cache = cache_objs[ 0] if cache_objs is not None else None tgt = self_attn( - paddle.to_variable(tgt), - paddle.to_variable(tgt), - paddle.to_variable(tgt), - paddle.to_variable(tgt_mask), self_attn_cache) + paddle.to_tensor(tgt), + paddle.to_tensor(tgt), + paddle.to_tensor(tgt), + paddle.to_tensor(tgt_mask), self_attn_cache) tgt = tgt[0].numpy() if cache else tgt.numpy() @@ -380,10 +380,10 @@ class TestTransformer(unittest.TestCase): cross_attn_cache = cache_objs[ 1] if cache_objs is not None else None tgt = cross_attn( - paddle.to_variable(tgt_norm), - paddle.to_variable(memory), - paddle.to_variable(memory), - paddle.to_variable(memory_mask), cross_attn_cache) + paddle.to_tensor(tgt_norm), + paddle.to_tensor(memory), + paddle.to_tensor(memory), + paddle.to_tensor(memory_mask), cross_attn_cache) tgt = tgt[0].numpy() if cache else tgt.numpy() # postprocess @@ -416,7 +416,7 @@ class TestTransformer(unittest.TestCase): encoder = TransformerEncoder(encoder_layer, num_layers) # src, src_mask enc_output = encoder( - paddle.to_variable(src), paddle.to_variable(src_mask)) + paddle.to_tensor(src), paddle.to_tensor(src_mask)) def test_decoder(self): batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( @@ -438,9 +438,9 @@ class TestTransformer(unittest.TestCase): decoder = TransformerDecoder(decoder_layer, num_layers) output = decoder( - paddle.to_variable(tgt), - paddle.to_variable(memory), - paddle.to_variable(tgt_mask), paddle.to_variable(memory_mask)) + paddle.to_tensor(tgt), + paddle.to_tensor(memory), + paddle.to_tensor(tgt_mask), paddle.to_tensor(memory_mask)) def test_transformer(self): batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( @@ -453,24 +453,24 @@ class TestTransformer(unittest.TestCase): n_head, dim_feedforward=dim_feedforward, dropout=dropout) - src = paddle.to_variable( + src = paddle.to_tensor( np.random.rand(batch_size, source_length, d_model).astype( "float32")) - tgt = paddle.to_variable( + tgt = paddle.to_tensor( np.random.rand(batch_size, target_length, d_model).astype( "float32")) src_mask = np.zeros((batch_size, n_head, source_length, source_length)).astype("float32") src_mask[0][0][0][0] = -np.inf - src_mask = paddle.to_variable(src_mask) + src_mask = paddle.to_tensor(src_mask) tgt_mask = np.zeros((batch_size, n_head, target_length, target_length)).astype("float32") tgt_mask[0][0][0][0] = -1e9 memory_mask = np.zeros((batch_size, n_head, target_length, source_length)).astype("float32") memory_mask[0][0][0][0] = -1e9 - tgt_mask, memory_mask = paddle.to_variable( - tgt_mask), paddle.to_variable(memory_mask) + tgt_mask, memory_mask = paddle.to_tensor( + tgt_mask), paddle.to_tensor(memory_mask) trans_output = transformer(src, tgt, src_mask, tgt_mask, memory_mask) diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py index 6bc42f0712a1a8c9f9a0640e06042c42e7cc948f..c4155e0d8260fe1fdc4a0e49e955fc2bbff0fc89 100644 --- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py +++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py @@ -424,10 +424,10 @@ class TestCTCLossAPICase(unittest.TestCase): loss_np = ctc.forward() paddle.disable_static() - softmax = paddle.to_variable(logits) - labels = paddle.to_variable(labels) - logits_length = paddle.to_variable(self.logits_length) - labels_length = paddle.to_variable(self.labels_length) + 
softmax = paddle.to_tensor(logits) + labels = paddle.to_tensor(labels) + logits_length = paddle.to_tensor(self.logits_length) + labels_length = paddle.to_tensor(self.labels_length) loss_pd_mean = F.ctc_loss( softmax, labels, @@ -477,10 +477,10 @@ class TestCTCLossAPICase(unittest.TestCase): loss_np = ctc.forward() paddle.disable_static() - softmax = paddle.to_variable(logits) - labels = paddle.to_variable(labels) - logits_length = paddle.to_variable(self.logits_length) - labels_length = paddle.to_variable(self.labels_length) + softmax = paddle.to_tensor(logits) + labels = paddle.to_tensor(labels) + logits_length = paddle.to_tensor(self.logits_length) + labels_length = paddle.to_tensor(self.labels_length) loss_pd = paddle.nn.CTCLoss(self.blank, 'none')( softmax, labels, logits_length, labels_length) diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index f33e4e0fca8727574bcd1970e26c6eaee2139a05..2ce442add2e02b5b1b869b4b233e734bc29bdea9 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -48,8 +48,8 @@ from paddle.fluid import core #DEFINE_ALIAS from ..fluid.dygraph.base import no_grad #DEFINE_ALIAS from ..fluid.dygraph.base import to_variable #DEFINE_ALIAS from ..fluid.dygraph.base import grad #DEFINE_ALIAS -from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS -from ..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS +from .io import save +from .io import load from ..fluid.dygraph.jit import SaveLoadConfig #DEFINE_ALIAS from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py new file mode 100644 index 0000000000000000000000000000000000000000..7175f3101448f3ee239c641fe0142ef211510a72 --- /dev/null +++ b/python/paddle/framework/io.py @@ -0,0 +1,291 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import collections +import pickle +import six +import warnings + +import paddle + +# deprecated module import +from paddle import fluid +from paddle.fluid import core +from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer +from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME + +__all__ = [ + 'save', + 'load', +] + + +def _build_saved_state_dict(state_dict): + save_dict = {} + name_table = {} + for key, value in state_dict.items(): + if isinstance(value, (Variable, core.VarBase)): + save_dict[key] = value.numpy() + name_table[key] = value.name + else: + save_dict[key] = value + save_dict["StructuredToParameterName@@"] = name_table + + return save_dict + + +def _load_state_dict_from_save_inference_model(model_path, config): + # 1. load program desc & construct _ProgramHolder + programs = _construct_program_holders(model_path, config.model_filename) + + # 2. 
load layer parameters & buffers + with fluid.dygraph.guard(): + persistable_var_dict = _construct_params_and_buffers( + model_path, + programs, + config.separate_params, + config.params_filename, + append_suffix=False) + + # 3. construct state_dict + load_param_dict = dict() + for var_name in persistable_var_dict: + load_param_dict[var_name] = persistable_var_dict[var_name].numpy() + + # if __variables.info__ exists, we can recover structured_name + var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) + if os.path.exists(var_info_path): + with open(var_info_path, 'rb') as f: + extra_var_info = pickle.load(f) + structured_para_dict = dict() + for var_name in load_param_dict: + structured_name = extra_var_info[var_name].get( + 'structured_name', None) + assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." % var_name + structured_para_dict[structured_name] = load_param_dict[ + var_name] + load_param_dict = structured_para_dict + + return load_param_dict + + +def _load_state_dict_from_save_params(model_path): + # Try to load all the files in the directory in VarBase format, + # the file name is used as the name of VarBase + load_var_list = [] + + # 1. load file names + var_name_list = [] + for root, _, files in os.walk(model_path): + for filename in files: + file_path = os.path.join(root, filename) + tmp_var_name = os.path.relpath(file_path, model_path) + var_name = tmp_var_name.replace("\\", "/") + var_name_list.append(var_name) + + # 2. create and load VarBase + with fluid.dygraph.guard(): + for name in var_name_list: + new_var = _varbase_creator(name=name, persistable=True) + _dygraph_tracer().trace_op( + type='load', + inputs={}, + outputs={'Out': new_var}, + attrs={'file_path': os.path.join(model_path, name)}) + load_var_list.append(new_var) + + # 3. construct state_dict + load_param_dict = dict() + for var in load_var_list: + load_param_dict[var.name] = var.numpy() + + return load_param_dict + + +def save(obj, path): + ''' + Save an object to the specified path. + + .. note:: + Now only supports save ``state_dict`` of Layer or Optimizer. + + Args: + obj(Object) : The object to be saved. + path(str) : The path of the object to be saved. + If saved in the current directory, the input path string will be used as the file name. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + + paddle.disable_static() + + emb = paddle.nn.Embedding(10, 10) + layer_state_dict = emb.state_dict() + paddle.save(layer_state_dict, "emb.pdparams") + + scheduler = paddle.optimizer.lr_scheduler.NoamLR( + d_model=0.01, warmup_steps=100, verbose=True) + adam = paddle.optimizer.Adam( + learning_rate=scheduler, + parameters=emb.parameters()) + opt_state_dict = adam.state_dict() + paddle.save(opt_state_dict, "adam.pdopt") + ''' + + # 1. input check + if not isinstance(obj, dict): + raise NotImplementedError( + "Now only supports save state_dict of Layer or Optimizer, " + "expect dict, but received %s." % type(obj)) + + if len(obj) == 0: + warnings.warn("The input state dict is empty, no need to save.") + + filename = os.path.basename(path) + if filename == "": + raise ValueError("The input path MUST be format of dirname/filename " + "[dirname\\filename in Windows system], but received " + "filename is empty string.") + + # 2. 
save object
+    dirname = os.path.dirname(path)
+    if dirname and not os.path.exists(dirname):
+        os.makedirs(dirname)
+
+    # TODO(chenweihang): support saving other objects
+    saved_obj = _build_saved_state_dict(obj)
+
+    with open(path, 'wb') as f:
+        pickle.dump(saved_obj, f, protocol=2)
+
+
+def load(path, config=None):
+    '''
+    Load an object that can be used in paddle from the specified path.
+
+    .. note::
+        Now only supports loading the ``state_dict`` of Layer or Optimizer.
+
+    .. note::
+        ``paddle.load`` supports loading ``state_dict`` from the result of several
+        paddle1.x save APIs in static mode, but due to some historical reasons,
+        if you load ``state_dict`` from the saved result of
+        ``paddle.static.save_inference_model/paddle.fluid.io.save_params/paddle.fluid.io.save_persistables`` ,
+        the structured variable name cannot be restored. You need to set the argument
+        ``use_structured_name=False`` when using ``Layer.set_state_dict`` later.
+
+    Args:
+        path(str) : The path to load the target object. Generally, the path is the target
+            file path; when loading the saved results of
+            ``paddle.jit.save/paddle.static.save_inference_model`` for compatibility, the path is a directory.
+        config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig`
+            object that specifies additional configuration options; these options
+            are for compatibility with ``paddle.jit.save/paddle.static.save_inference_model``
+            formats. Default None.
+
+    Returns:
+        Object(Object): a target object that can be used in paddle
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            paddle.disable_static()
+
+            emb = paddle.nn.Embedding(10, 10)
+            layer_state_dict = emb.state_dict()
+            paddle.save(layer_state_dict, "emb.pdparams")
+
+            scheduler = paddle.optimizer.lr_scheduler.NoamLR(
+                d_model=0.01, warmup_steps=100, verbose=True)
+            adam = paddle.optimizer.Adam(
+                learning_rate=scheduler,
+                parameters=emb.parameters())
+            opt_state_dict = adam.state_dict()
+            paddle.save(opt_state_dict, "adam.pdopt")
+
+            load_layer_state_dict = paddle.load("emb.pdparams")
+            load_opt_state_dict = paddle.load("adam.pdopt")
+    '''
+    # 1. input check
+    if not os.path.exists(path):
+        error_msg = "The path `%s` does not exist."
+        # if the current path is a prefix, and path.pdparams or path.pdopt
+        # exists, users may want to use `paddle.load` to load the result of
+        # `fluid.save_dygraph`, so we raise an error here for them
+        params_file_path = path + ".pdparams"
+        opti_file_path = path + ".pdopt"
+        if os.path.exists(params_file_path) or os.path.exists(opti_file_path):
+            error_msg += " If you want to load the results saved by `fluid.save_dygraph`, " \
+                "please specify the full file name, not just the file name prefix. For " \
+                "example, it should be written as `paddle.load('model.pdparams')` instead of " \
+                "`paddle.load('model')`."
+        raise ValueError(error_msg % path)
+
+    if config is None:
+        config = paddle.SaveLoadConfig()
+
+    # 2. load target
+    load_result = None
+    if os.path.isfile(path):
+        # if path is a file, we assume it was created by paddle.save
+        with open(path, 'rb') as f:
+            load_result = pickle.load(f) if six.PY2 else pickle.load(
+                f, encoding='latin1')
+
+        if not config.keep_name_table and "StructuredToParameterName@@" in load_result:
+            del load_result["StructuredToParameterName@@"]
+    elif os.path.isdir(path):
+        # if path is a directory, we assume it is compatible with loading
+        # the results stored by static-mode save APIs
+
+        # check whether model file exists
+        if config.model_filename is None:
+            model_filename = '__model__'
+        else:
+            model_filename = config.model_filename
+        model_file_path = os.path.join(path, model_filename)
+
+        if os.path.exists(model_file_path):
+            # Load state dict by `jit.save/io.save_inference_model` save format
+            # NOTE(chenweihang): [ Compatibility of save_inference_model save format ]
+            # The model saved by `save_inference_model` does not completely correspond to
+            # the information required by the `state_dict` under the dygraph.
+            # `save_inference_model` does not save structured names, so we need to remind
+            # the user to configure the `use_structured_name` argument when calling `set_state_dict`
+            # NOTE(chenweihang): `jit.save` doesn't save optimizer state
+            load_result = _load_state_dict_from_save_inference_model(path,
+                                                                     config)
+        else:
+            # load state dict by `io.save_params/persistables` save format
+            # TODO(chenweihang): [ Now only supports loading parameters separately ]
+            # If users save all parameters as one file, the [ variable.name -> variable ]
+            # mapping info will be lost, so users need to give a variable list, but building the
+            # variable list in dygraph mode is difficult; we recommend users use
+            # paddle.static.load_program_state in this case
+            load_result = _load_state_dict_from_save_params(path)
+    else:
+        raise ValueError(
+            "Unsupported path format, now only supports file or directory.")
+
+    return load_result
diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py
index 7ed571fa9c6a4a962b20397c999368dad0734ff0..69b7fedd72eed52cfd06715025f5cd88983e2e2a 100644
--- a/python/paddle/hapi/callbacks.py
+++ b/python/paddle/hapi/callbacks.py
@@ -301,10 +301,11 @@ class ProgBarLogger(Callback):
             train_dataset = paddle.vision.datasets.MNIST(mode='train')
-            model = paddle.Model(paddle.vision.LeNet(classifier_activation=None),
+            lenet = paddle.vision.LeNet()
+            model = paddle.Model(lenet,
                 inputs, labels)
-            optim = paddle.optimizer.Adam(0.001)
+            optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters())
             model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy())
@@ -436,10 +437,11 @@ class ModelCheckpoint(Callback):
             train_dataset = paddle.vision.datasets.MNIST(mode='train')
-            model = paddle.Model(paddle.vision.LeNet(classifier_activation=None),
+            lenet = paddle.vision.LeNet()
+            model = paddle.Model(lenet,
                 inputs, labels)
-            optim = paddle.optimizer.Adam(0.001)
+            optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters())
             model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy())
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index d41852c9d7f4f5812a852d6a5c644e75d137f530..1bfe8f07a2fbd90837eaaea28616b64b9d4ac26a 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -814,10 +814,9 @@ class Model(object):
              from paddle.static import InputSpec
             device = paddle.set_device('cpu') # or 'gpu'
-             # if use static graph, do not set -
paddle.disable_static(device) net = nn.Sequential( + nn.Flatten(1), nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10)) @@ -833,7 +832,7 @@ class Model(object): paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy()) - data = paddle.vision.datasets.MNIST(mode='train', chw_format=False) + data = paddle.vision.datasets.MNIST(mode='train') model.fit(data, epochs=2, batch_size=32, verbose=1) """ @@ -850,7 +849,8 @@ class Model(object): if not isinstance(inputs, (list, dict, Input)): raise TypeError( - "'inputs' must be list or dict, and couldn't be None.") + "'inputs' must be list or dict in static graph mode") + self._inputs = self._verify_spec(inputs, True) self._labels = self._verify_spec(labels) @@ -885,7 +885,6 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) net = nn.Sequential( nn.Linear(784, 200), @@ -930,7 +929,6 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) net = nn.Sequential( nn.Linear(784, 200), @@ -970,9 +968,12 @@ class Model(object): import numpy as np import paddle import paddle.nn as nn + from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) + + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') net = nn.Sequential( nn.Linear(784, 200), @@ -980,7 +981,7 @@ class Model(object): nn.Linear(200, 10), nn.Softmax()) - model = paddle.Model(net) + model = paddle.Model(net, input, label) model.prepare() data = np.random.random(size=(4,784)).astype(np.float32) out = model.test_batch([data]) @@ -1026,6 +1027,7 @@ class Model(object): def __init__(self): super(Mnist, self).__init__() self.net = nn.Sequential( + nn.Flatten(1), nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10), @@ -1045,7 +1047,7 @@ class Model(object): optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss()) - data = paddle.vision.datasets.MNIST(mode='train', chw_format=False) + data = paddle.vision.datasets.MNIST(mode='train') model.fit(data, epochs=1, batch_size=32, verbose=0) model.save('checkpoint/test') # save for training model.save('inference_model', False) # save for inference @@ -1092,15 +1094,18 @@ class Model(object): import paddle import paddle.nn as nn - + from paddle.static import InputSpec + device = paddle.set_device('cpu') - paddle.disable_static(device) + + input = InputSpec([None, 784], 'float32', 'x') model = paddle.Model(nn.Sequential( nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10), - nn.Softmax())) + nn.Softmax()), input) + model.save('checkpoint/test') model.load('checkpoint/test') """ @@ -1165,13 +1170,15 @@ class Model(object): import paddle import paddle.nn as nn + from paddle.static import InputSpec - paddle.disable_static() - + input = InputSpec([None, 784], 'float32', 'x') + model = paddle.Model(nn.Sequential( nn.Linear(784, 200), nn.Tanh(), - nn.Linear(200, 10))) + nn.Linear(200, 10)), input) + params = model.parameters() """ return self._adapter.parameters() @@ -1313,7 +1320,7 @@ class Model(object): label = InputSpec([None, 1], 'int64', 'label') model = paddle.Model( - paddle.vision.models.LeNet(classifier_activation=None), + paddle.vision.models.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) @@ -1350,7 +1357,7 @@ class Model(object): label = InputSpec([None, 1], 'int64', 
'label') model = paddle.Model( - paddle.vision.models.LeNet(classifier_activation=None), input, label) + paddle.vision.models.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) model.prepare( @@ -1483,7 +1490,7 @@ class Model(object): # imperative mode paddle.disable_static() - model = paddle.Model(paddle.vision.models.LeNet()) + model = paddle.Model(paddle.vision.models.LeNet(), input, label) model.prepare(metrics=paddle.metric.Accuracy()) result = model.evaluate(val_dataset, batch_size=64) print(result) @@ -1580,19 +1587,20 @@ class Model(object): test_dataset = MnistDataset(mode='test', return_label=False) - # declarative mode + # imperative mode input = InputSpec([-1, 1, 28, 28], 'float32', 'image') model = paddle.Model(paddle.vision.models.LeNet(), input) model.prepare() - result = model.predict(test_dataset, batch_size=64) print(len(result[0]), result[0][0].shape) - # imperative mode + # declarative mode device = paddle.set_device('cpu') - paddle.disable_static(device) - model = paddle.Model(paddle.vision.models.LeNet()) + paddle.enable_static() + input = InputSpec([-1, 1, 28, 28], 'float32', 'image') + model = paddle.Model(paddle.vision.models.LeNet(), input) model.prepare() + result = model.predict(test_dataset, batch_size=64) print(len(result[0]), result[0][0].shape) """ @@ -1813,7 +1821,7 @@ class Model(object): return logs, outputs return logs - def summary(self, input_size=None, batch_size=None, dtype=None): + def summary(self, input_size=None, dtype=None): """Prints a string summary of the network. Args: @@ -1822,7 +1830,6 @@ class Model(object): one input, input_size can be tuple or InputSpec. if model have multiple input, input_size must be a list which contain every input's shape. Default: None. - batch_size (int, optional): batch size of input tensor, Default: None. dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None. Returns: @@ -1833,15 +1840,11 @@ class Model(object): import paddle from paddle.static import InputSpec - - dynamic = True - device = paddle.set_device('cpu') - paddle.disable_static(device) if dynamic else None input = InputSpec([None, 1, 28, 28], 'float32', 'image') label = InputSpec([None, 1], 'int64', 'label') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), + model = paddle.Model(paddle.vision.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) @@ -1859,7 +1862,7 @@ class Model(object): _input_size = input_size else: _input_size = self._inputs - return summary(self.network, _input_size, batch_size, dtype) + return summary(self.network, _input_size, dtype) def _verify_spec(self, specs, is_input=False): out_specs = [] diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index d388ba62f2a244f84497810739e5fd6b50f669d2..3ead3fc295c0b2e8772b16e2aeb3a4fd1f2be75a 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -25,7 +25,7 @@ from collections import OrderedDict __all__ = ['summary'] -def summary(net, input_size, batch_size=None, dtypes=None): +def summary(net, input_size, dtypes=None): """Prints a string summary of the network. Args: @@ -33,8 +33,8 @@ def summary(net, input_size, batch_size=None, dtypes=None): input_size (tuple|InputSpec|list[tuple|InputSpec]): size of input tensor. if model only have one input, input_size can be tuple or InputSpec. 
if model have multiple input, input_size must be a list which contain - every input's shape. - batch_size (int, optional): batch size of input tensor, Default: None. + every input's shape. Note that input_size only dim of + batch_size can be None or -1. dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None. Returns: @@ -77,14 +77,12 @@ def summary(net, input_size, batch_size=None, dtypes=None): lenet = LeNet() - params_info = paddle.summary(lenet, (1, 28, 28)) + params_info = paddle.summary(lenet, (1, 1, 28, 28)) print(params_info) """ if isinstance(input_size, InputSpec): - _input_size = tuple(input_size.shape[1:]) - if batch_size is None: - batch_size = input_size.shape[0] + _input_size = tuple(input_size.shape) elif isinstance(input_size, list): _input_size = [] for item in input_size: @@ -96,9 +94,7 @@ def summary(net, input_size, batch_size=None, dtypes=None): type(item)) if isinstance(item, InputSpec): - _input_size.append(tuple(item.shape[1:])) - if batch_size is None: - batch_size = item.shape[0] + _input_size.append(tuple(item.shape)) else: _input_size.append(item) elif isinstance(input_size, int): @@ -106,28 +102,88 @@ def summary(net, input_size, batch_size=None, dtypes=None): else: _input_size = input_size - if batch_size is None: - batch_size = -1 - if not paddle.in_dynamic_mode(): warnings.warn( "Your model was created in static mode, this may not get correct summary information!" ) - result, params_info = summary_string(net, _input_size, batch_size, dtypes) + def _is_shape(shape): + for item in shape: + if isinstance(item, (list, tuple)): + return False + return True + + def _check_shape(shape): + num_unknown = 0 + new_shape = [] + for i in range(len(shape)): + item = shape[i] + if item is None or item == -1: + num_unknown += 1 + if num_unknown > 1: + raise ValueError( + 'Option input_size only the dim of batch_size can be None or -1.' + ) + item = 1 + elif isinstance(item, numbers.Number): + if item <= 0: + raise ValueError( + "Expected element in input size greater than zero, but got {}". 
+ format(item)) + new_shape.append(item) + return tuple(new_shape) + + def _check_input(input_size): + if isinstance(input_size, (list, tuple)) and _is_shape(input_size): + return _check_shape(input_size) + else: + return [_check_input(i) for i in input_size] + + _input_size = _check_input(_input_size) + result, params_info = summary_string(net, _input_size, dtypes) print(result) return params_info -def summary_string(model, input_size, batch_size=-1, dtypes=None): - if dtypes == None: - dtypes = ['float32'] * len(input_size) +def summary_string(model, input_size, dtypes=None): + def _all_is_numper(items): + for item in items: + if not isinstance(item, numbers.Number): + return False + return True + + def _build_dtypes(input_size, dtype): + if dtype is None: + dtype = 'float32' + + if isinstance(input_size, (list, tuple)) and _all_is_numper(input_size): + return [dtype] + else: + return [_build_dtypes(i, dtype) for i in input_size] + + if not isinstance(dtypes, (list, tuple)): + dtypes = _build_dtypes(input_size, dtypes) + + batch_size = 1 summary_str = '' depth = len(list(model.sublayers())) + def _get_shape_from_tensor(x): + if isinstance(x, (paddle.fluid.Variable, paddle.fluid.core.VarBase)): + return list(x.shape) + elif isinstance(x, (list, tuple)): + return [_get_shape_from_tensor(xx) for xx in x] + + def _get_output_shape(output): + if isinstance(output, (list, tuple)): + output_shape = [_get_output_shape(o) for o in output] + else: + output_shape = list(output.shape) + return output_shape + def register_hook(layer): def hook(layer, input, output): class_name = str(layer.__class__).split(".")[-1].split("'")[0] @@ -139,14 +195,18 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None): m_key = "%s-%i" % (class_name, layer_idx + 1) summary[m_key] = OrderedDict() - summary[m_key]["input_shape"] = list(input[0].shape) - summary[m_key]["input_shape"][0] = batch_size - if isinstance(output, (list, tuple)): - summary[m_key]["output_shape"] = [[-1] + list(o.shape)[1:] - for o in output] - else: - summary[m_key]["output_shape"] = list(output.shape) - summary[m_key]["output_shape"][0] = batch_size + + try: + summary[m_key]["input_shape"] = _get_shape_from_tensor(input) + except: + warnings.warn('Get layer {} input shape failed!') + summary[m_key]["input_shape"] = [] + + try: + summary[m_key]["output_shape"] = _get_output_shape(output) + except: + warnings.warn('Get layer {} output shape failed!') + summary[m_key]["output_shape"] params = 0 @@ -175,29 +235,22 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None): hooks.append(layer.register_forward_post_hook(hook)) - def _check_input_size(input_sizes): - for input_size in input_sizes: - for item in input_size: - if not isinstance(item, numbers.Number): - raise TypeError( - "Expected item in input size be a number, but got {}". - format(type(item))) - - if item <= 0: - raise ValueError( - "Expected item in input size greater than zero, but got {}". 
- format(item)) - if isinstance(input_size, tuple): input_size = [input_size] - _check_input_size(input_size) + def build_input(input_size, dtypes): + if isinstance(input_size, (list, tuple)) and _all_is_numper(input_size): + if isinstance(dtypes, (list, tuple)): + dtype = dtypes[0] + else: + dtype = dtypes + return paddle.rand(list(input_size), dtype) + else: + return [ + build_input(i, dtype) for i, dtype in zip(input_size, dtypes) + ] - x = [ - paddle.rand( - [2] + list(in_size), dtype=dtype) - for in_size, dtype in zip(input_size, dtypes) - ] + x = build_input(input_size, dtypes) # create properties summary = OrderedDict() @@ -213,22 +266,65 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None): for h in hooks: h.remove() - table_width = 80 - summary_str += "-" * table_width + "\n" - line_new = "{:>15} {:>20} {:>20} {:>15}".format( - "Layer (type)", "Input Shape", "Output Shape", "Param #") + def _get_str_length(summary): + head_length = { + 'layer_width': 15, + 'input_shape_width': 20, + 'output_shape_width': 20, + 'params_width': 15, + 'table_width': 75 + } + + for layer in summary: + if head_length['output_shape_width'] < len( + str(summary[layer]["output_shape"])): + head_length['output_shape_width'] = len( + str(summary[layer]["output_shape"])) + if head_length['input_shape_width'] < len( + str(summary[layer]["input_shape"])): + head_length['input_shape_width'] = len( + str(summary[layer]["input_shape"])) + if head_length['layer_width'] < len(str(layer)): + head_length['layer_width'] = len(str(layer)) + if head_length['params_width'] < len( + str(summary[layer]["nb_params"])): + head_length['params_width'] = len( + str(summary[layer]["nb_params"])) + + _temp_width = 0 + for k, v in head_length.items(): + if k != 'table_width': + _temp_width += v + + if head_length['table_width'] < _temp_width + 5: + head_length['table_width'] = _temp_width + 5 + + return head_length + + table_width = _get_str_length(summary) + + summary_str += "-" * table_width['table_width'] + "\n" + line_new = "{:^{}} {:^{}} {:^{}} {:^{}}".format( + "Layer (type)", table_width['layer_width'], "Input Shape", + table_width['input_shape_width'], "Output Shape", + table_width['output_shape_width'], "Param #", + table_width['params_width']) summary_str += line_new + "\n" - summary_str += "=" * table_width + "\n" + summary_str += "=" * table_width['table_width'] + "\n" total_params = 0 total_output = 0 trainable_params = 0 + max_length = 0 for layer in summary: # input_shape, output_shape, trainable, nb_params - line_new = "{:>15} {:>20} {:>20} {:>15}".format( - layer, + line_new = "{:^{}} {:^{}} {:^{}} {:^{}}".format( + layer, table_width['layer_width'], str(summary[layer]["input_shape"]), + table_width['input_shape_width'], str(summary[layer]["output_shape"]), - "{0:,}".format(summary[layer]["nb_params"]), ) + table_width['output_shape_width'], + "{0:,}".format(summary[layer]["nb_params"]), + table_width['params_width']) total_params += summary[layer]["nb_params"] try: @@ -242,25 +338,32 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None): trainable_params += summary[layer]["nb_params"] summary_str += line_new + "\n" - # assume 4 bytes/number (float on cuda). - total_input_size = abs( - np.prod(sum(input_size, ())) * batch_size * 4. / (1024**2.)) + def _get_input_size(input_size, size): + if isinstance(input_size, (list, tuple)) and _all_is_numper(input_size): + size = abs(np.prod(input_size) * 4. 
/ (1024**2.)) + else: + size = sum([_get_input_size(i, size) for i in input_size]) + return size + + total_input_size = _get_input_size(input_size, 0) + total_output_size = abs(2. * total_output * 4. / (1024**2.)) # x2 for gradients total_params_size = abs(total_params * 4. / (1024**2.)) total_size = total_params_size + total_output_size + total_input_size - summary_str += "=" * table_width + "\n" + summary_str += "=" * table_width['table_width'] + "\n" summary_str += "Total params: {0:,}".format(total_params) + "\n" summary_str += "Trainable params: {0:,}".format(trainable_params) + "\n" summary_str += "Non-trainable params: {0:,}".format(total_params - trainable_params) + "\n" - summary_str += "-" * table_width + "\n" + summary_str += "-" * table_width['table_width'] + "\n" summary_str += "Input size (MB): %0.2f" % total_input_size + "\n" summary_str += "Forward/backward pass size (MB): %0.2f" % total_output_size + "\n" summary_str += "Params size (MB): %0.2f" % total_params_size + "\n" summary_str += "Estimated Total Size (MB): %0.2f" % total_size + "\n" - summary_str += "-" * table_width + "\n" + summary_str += "-" * table_width['table_width'] + "\n" + # return summary return summary_str, { 'total_params': total_params, diff --git a/python/paddle/tensor/io.py b/python/paddle/inference/__init__.py similarity index 78% rename from python/paddle/tensor/io.py rename to python/paddle/inference/__init__.py index 66e956e8e4bf0c3beb8f0d71a63b2d9407ef22fc..c388301ec3408e436eacb2567e8e529d0bbc03bb 100644 --- a/python/paddle/tensor/io.py +++ b/python/paddle/inference/__init__.py @@ -12,8 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define functions to save & load a tensor -from ..fluid import save #DEFINE_ALIAS -from ..fluid.io import load #DEFINE_ALIAS - -__all__ = ['save', 'load'] +from ..fluid.inference import Config, DataType, PlaceType, PrecisionType, Tensor, \ + Predictor, create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index 6f0b0f3c9c135e00a01c69869742a40ff615a96b..92dd819b3cd5eb2a7c1df479ea1ede288c5ba8f1 100644 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -25,16 +25,8 @@ __all__ = [ 'Sampler', 'SequenceSampler', 'RandomSampler', - 'load', - 'save', - 'load_program_state', - 'set_program_state', - 'load_inference_model', - 'save_inference_model', ] from ..fluid.io import DataLoader from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \ TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler -from ..fluid.io import load, save, load_program_state, set_program_state, \ - load_inference_model, save_inference_model, batch diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 1cd65171ff034e8b834c38184e4452796da985ca..f4a9b8c01d02a109f91aa717342ba47321f5f47e 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -182,7 +182,6 @@ class Accuracy(Metric): import numpy as np import paddle - paddle.disable_static() x = paddle.to_tensor(np.array([ [0.1, 0.2, 0.3, 0.4], [0.1, 0.4, 0.3, 0.2], @@ -202,11 +201,13 @@ class Accuracy(Metric): .. 
code-block:: python import paddle - - paddle.disable_static() + from paddle.static import InputSpec + + input = InputSpec([None, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None)) + model = paddle.Model(paddle.vision.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) model.prepare( diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 91a2a78203cbc50fec27b4f3ae8d3541ac4ec5da..8ee4d73ea847ea116ea4401b5b05ef1b925950fe 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -62,6 +62,22 @@ def cache(reader): Returns: generator: a decorated reader object which yields data from cached memory. + + Examples: + .. code-block:: python + + import paddle + + def reader(): + for i in range(3): + yield i + + # All data is cached into memory + cached_reader = paddle.io.cache(reader) + + # Output: 0 1 2 + for i in cached_reader(): + print(i) """ all_data = tuple(reader()) @@ -296,12 +312,28 @@ def buffered(reader, size): buffer. Reading from the buffered data reader will proceed as long as the buffer is not empty. - :param reader: the data reader to read from. - :type reader: callable - :param size: max buffer size. - :type size: int + Args: + reader(generator): the data reader to read from. + size(int): max buffer size. + + Returns: + generator: the buffered data reader. + + Examples: + .. code-block:: python - :returns: the buffered data reader. + import paddle + + def reader(): + for i in range(3): + yield i + + # Create a buffered reader, and the buffer size is 2. + buffered_reader = paddle.io.buffered(reader, 2) + + # Output: 0 1 2 + for i in buffered_reader(): + print(i) """ class EndSignal(): diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 42a28a4f04e368cf8a1c1a144639bc743234a540..e0a9bc6eec3ba37049cc670a63fbf54f68c5defb 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -17,8 +17,9 @@ __all__ = [ 'append_backward', 'gradients', 'Executor', 'global_scope', 'scope_guard', 'BuildStrategy', 'CompiledProgram', 'Print', 'py_func', 'ExecutionStrategy', 'name_scope', 'ParallelExecutor', 'program_guard', 'WeightNormParamAttr', - 'default_main_program', 'default_startup_program', 'Program', 'save', - 'load', 'data', 'InputSpec' + 'default_main_program', 'default_startup_program', 'Program', 'data', + 'InputSpec', 'save', 'load', 'save_inference_model', 'load_inference_model', + 'load_program_state', 'set_program_state' ] from . 
import nn @@ -41,5 +42,9 @@ from ..fluid.layers.control_flow import Print #DEFINE_ALIAS from ..fluid.layers.nn import py_func #DEFINE_ALIAS from ..fluid.parallel_executor import ParallelExecutor #DEFINE_ALIAS from ..fluid.param_attr import WeightNormParamAttr #DEFINE_ALIAS -from ..tensor.io import save #DEFINE_ALIAS -from ..tensor.io import load #DEFINE_ALIAS +from ..fluid.io import save #DEFINE_ALIAS +from ..fluid.io import load #DEFINE_ALIAS +from ..fluid.io import save_inference_model #DEFINE_ALIAS +from ..fluid.io import load_inference_model #DEFINE_ALIAS +from ..fluid.io import load_program_state #DEFINE_ALIAS +from ..fluid.io import set_program_state #DEFINE_ALIAS diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py index eb70320ea7551de6e1117900e3769f000fdf23dd..d7a3cfcdb92debe0447cb4054478729e92dbab32 100644 --- a/python/paddle/static/input.py +++ b/python/paddle/static/input.py @@ -19,10 +19,12 @@ from paddle.fluid import core, Variable from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.data_feeder import check_type from paddle.fluid.framework import convert_np_dtype_to_dtype_ +from paddle.fluid.framework import static_only __all__ = ['data', 'InputSpec'] +@static_only def data(name, shape, dtype=None, lod_level=0): """ **Data Layer** diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index cec989fba8b0887499876f94bb862f72ba0e18d5..b6bab16c9680cf839be3b94b9a2c0fbe92d26dd9 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -42,8 +42,6 @@ from .creation import tril #DEFINE_ALIAS from .creation import meshgrid #DEFINE_ALIAS from .creation import empty #DEFINE_ALIAS from .creation import empty_like #DEFINE_ALIAS -from .io import save #DEFINE_ALIAS -from .io import load #DEFINE_ALIAS from .linalg import matmul #DEFINE_ALIAS from .linalg import dot #DEFINE_ALIAS # from .linalg import einsum #DEFINE_ALIAS diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index f27cfba487d78f284408815eaba933b18f303df9..26624d3b5ffbce371840c54580a0696cc8239402 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -156,8 +156,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): def __check_input(x, y): var_names = {'x': x, 'y': y} for name, val in var_names.items(): - check_variable_and_dtype(val, name, ['float32', 'float64'], - 'matmul') + check_variable_and_dtype( + val, name, ['float16', 'float32', 'float64'], 'matmul') __check_input(x, y) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 9de407841fb461713d00f997afdf33a38a531245..dc6a04a4723bd92dbe1c76fce5b3e52981136211 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -53,7 +53,7 @@ __all__ = [ 'shard_index', 'slice', 'split', - 'chunk' + 'chunk', 'squeeze', 'stack', 'strided_slice', diff --git a/python/paddle/tests/dist_hapi_mnist_dynamic.py b/python/paddle/tests/dist_hapi_mnist_dynamic.py index 13d966bf38f2aaed35e120aa4d25705cfc36c230..46d02789402b22263cfbd8cbdfeb6d66a5de900d 100644 --- a/python/paddle/tests/dist_hapi_mnist_dynamic.py +++ b/python/paddle/tests/dist_hapi_mnist_dynamic.py @@ -68,7 +68,7 @@ class TestDistTraning(unittest.TestCase): inputs = [Input(im_shape, 'float32', 'image')] labels = [Input([None, 1], 'int64', 'label')] - model = Model(LeNet(classifier_activation=None), inputs, labels) + model = Model(LeNet(), inputs, labels) optim = fluid.optimizer.Momentum( 
learning_rate=0.001, momentum=.9, parameter_list=model.parameters()) model.prepare(optim, CrossEntropyLoss(), Accuracy()) diff --git a/python/paddle/tests/dist_hapi_mnist_static.py b/python/paddle/tests/dist_hapi_mnist_static.py index 9d8e5f3652c9810579a0b66035a64d1d3b915bff..eab34a6dafbc354a24aa51e93a9fec9efc3b3cee 100644 --- a/python/paddle/tests/dist_hapi_mnist_static.py +++ b/python/paddle/tests/dist_hapi_mnist_static.py @@ -67,7 +67,7 @@ class TestDistTraning(unittest.TestCase): inputs = [Input(im_shape, 'float32', 'image')] labels = [Input([None, 1], 'int64', 'label')] - model = Model(LeNet(classifier_activation=None), inputs, labels) + model = Model(LeNet(), inputs, labels) optim = fluid.optimizer.Momentum( learning_rate=0.001, momentum=.9, parameter_list=model.parameters()) model.prepare(optim, CrossEntropyLoss(), Accuracy()) diff --git a/python/paddle/tests/test_dataset_movie_reviews.py b/python/paddle/tests/test_dataset_movie_reviews.py deleted file mode 100644 index e6e6667013f89aca305f82a744c00de2af818736..0000000000000000000000000000000000000000 --- a/python/paddle/tests/test_dataset_movie_reviews.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
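The distributed MNIST tests above now wrap a plain `LeNet()` (no `classifier_activation`) in the high-level `Model` API. A minimal sketch of that updated pattern, using the 2.0-style optimizer shown elsewhere in this patch; the hyperparameters and shapes are illustrative placeholders, not part of the change itself:

import paddle
from paddle.static import InputSpec

inputs = [InputSpec([None, 1, 28, 28], 'float32', 'image')]
labels = [InputSpec([None, 1], 'int64', 'label')]

# LeNet now returns logits; CrossEntropyLoss applies the softmax internally.
model = paddle.Model(paddle.vision.models.LeNet(), inputs, labels)
optim = paddle.optimizer.Adam(
    learning_rate=0.001, parameters=model.parameters())
model.prepare(optim, paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy())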
- -import unittest -import numpy as np - -from paddle.text.datasets import * - - -class TestMovieReviewsTrain(unittest.TestCase): - def test_main(self): - movie_reviews = MovieReviews(mode='train') - self.assertTrue(len(movie_reviews) == 1600) - - # traversal whole dataset may cost a - # long time, randomly check 1 sample - idx = np.random.randint(0, 1600) - data = movie_reviews[idx] - self.assertTrue(len(data) == 2) - self.assertTrue(len(data[0].shape) == 1) - self.assertTrue(int(data[1]) in [0, 1]) - - -class TestMovieReviewsTest(unittest.TestCase): - def test_main(self): - movie_reviews = MovieReviews(mode='test') - self.assertTrue(len(movie_reviews) == 400) - - # traversal whole dataset may cost a - # long time, randomly check 1 sample - idx = np.random.randint(0, 400) - data = movie_reviews[idx] - self.assertTrue(len(data) == 2) - self.assertTrue(len(data[0].shape) == 1) - self.assertTrue(int(data[1]) in [0, 1]) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 62cc39c1f7b5303d98bffd9eb5814d4579a6d3f1..5a3d837407b14be4ab3ce9ae189fbc1d09b2b5ff 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -40,7 +40,7 @@ from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTra class LeNetDygraph(paddle.nn.Layer): - def __init__(self, num_classes=10, classifier_activation=None): + def __init__(self, num_classes=10): super(LeNetDygraph, self).__init__() self.num_classes = num_classes self.features = Sequential( @@ -55,8 +55,7 @@ class LeNetDygraph(paddle.nn.Layer): if num_classes > 0: self.fc = Sequential( - Linear(400, 120), Linear(120, 84), Linear(84, 10), - Softmax()) #Todo: accept any activation + Linear(400, 120), Linear(120, 84), Linear(84, 10)) def forward(self, inputs): x = self.features(inputs) @@ -67,6 +66,34 @@ class LeNetDygraph(paddle.nn.Layer): return x +class LeNetDeclarative(fluid.dygraph.Layer): + def __init__(self, num_classes=10): + super(LeNetDeclarative, self).__init__() + self.num_classes = num_classes + self.features = Sequential( + Conv2d( + 1, 6, 3, stride=1, padding=1), + ReLU(), + Pool2D(2, 'max', 2), + Conv2d( + 6, 16, 5, stride=1, padding=0), + ReLU(), + Pool2D(2, 'max', 2)) + + if num_classes > 0: + self.fc = Sequential( + Linear(400, 120), Linear(120, 84), Linear(84, 10)) + + @declarative + def forward(self, inputs): + x = self.features(inputs) + + if self.num_classes > 0: + x = fluid.layers.flatten(x, 1) + x = self.fc(x) + return x + + class MnistDataset(MNIST): def __init__(self, mode, return_label=True, sample_num=None): super(MnistDataset, self).__init__(mode=mode) @@ -198,7 +225,7 @@ class TestModel(unittest.TestCase): paddle.manual_seed(seed) paddle.framework.random._manual_program_seed(seed) - net = LeNet(classifier_activation=None) + net = LeNet() optim_new = fluid.optimizer.Adam( learning_rate=0.001, parameter_list=net.parameters()) model = Model(net, inputs=self.inputs, labels=self.labels) @@ -287,14 +314,12 @@ class TestModel(unittest.TestCase): class MyModel(paddle.nn.Layer): - def __init__(self, classifier_activation='softmax'): + def __init__(self): super(MyModel, self).__init__() self._fc = Linear(20, 10) - self._act = Softmax() #Todo: accept any activation def forward(self, x): y = self._fc(x) - y = self._act(y) return y @@ -311,7 +336,7 @@ class TestModelFunction(unittest.TestCase): def get_expect(): fluid.enable_dygraph(fluid.CPUPlace()) self.set_seed() - m = MyModel(classifier_activation=None) + m = 
MyModel() optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=m.parameters()) m.train() @@ -330,7 +355,7 @@ class TestModelFunction(unittest.TestCase): fluid.enable_dygraph(device) if dynamic else None self.set_seed() - net = MyModel(classifier_activation=None) + net = MyModel() optim2 = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=net.parameters()) @@ -374,7 +399,7 @@ class TestModelFunction(unittest.TestCase): for dynamic in [True, False]: device = paddle.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, @@ -417,7 +442,7 @@ class TestModelFunction(unittest.TestCase): fluid.enable_dygraph(device) inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] - model = Model(MyModel(classifier_activation=None), inputs, labels) + model = Model(MyModel(), inputs, labels) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) @@ -426,7 +451,7 @@ class TestModelFunction(unittest.TestCase): inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] - model = Model(MyModel(classifier_activation=None), inputs, labels) + model = Model(MyModel(), inputs, labels) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) @@ -436,7 +461,7 @@ class TestModelFunction(unittest.TestCase): def test_static_save_dynamic_load(self): path = tempfile.mkdtemp() - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, @@ -448,7 +473,7 @@ class TestModelFunction(unittest.TestCase): device = paddle.set_device('cpu') fluid.enable_dygraph(device) #if dynamic else None - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, @@ -494,17 +519,22 @@ class TestModelFunction(unittest.TestCase): model.summary(input_size=(20)) model.summary(input_size=[(20)]) - model.summary(input_size=(20), batch_size=2) + model.summary(input_size=(20), dtype='float32') def test_summary_nlp(self): paddle.enable_static() - nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) - paddle.summary(nlp_net, (1, 2)) + nlp_net = paddle.nn.GRU(input_size=2, + hidden_size=3, + num_layers=3, + direction="bidirectional") + paddle.summary(nlp_net, (1, 1, 2)) + rnn = paddle.nn.LSTM(16, 32, 2) + paddle.summary(rnn, [(-1, 23, 16), ((2, None, 32), (2, -1, 32))]) def test_summary_error(self): with self.assertRaises(TypeError): nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) - paddle.summary(nlp_net, (1, '2')) + paddle.summary(nlp_net, (1, 1, '2')) with self.assertRaises(ValueError): nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) @@ -512,7 +542,7 @@ class TestModelFunction(unittest.TestCase): paddle.disable_static() nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) - paddle.summary(nlp_net, (1, 2)) + paddle.summary(nlp_net, (1, 1, 2)) def test_export_deploy_model(self): for 
dynamic in [True, False]: @@ -552,7 +582,7 @@ class TestModelFunction(unittest.TestCase): class TestRaiseError(unittest.TestCase): def test_input_without_name(self): - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 10], 'float32')] labels = [InputSpec([None, 1], 'int64', 'label')] @@ -562,7 +592,7 @@ class TestRaiseError(unittest.TestCase): def test_input_without_input_spec(self): for dynamic in [True, False]: paddle.disable_static() if dynamic else None - net = MyModel(classifier_activation=None) + net = MyModel() with self.assertRaises(TypeError): model = Model(net) paddle.enable_static() diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py index 641147d39e94f7c2bbb426900ed484546bad49c6..bf9c2a2ae061179bd9d656fa3cb23c5ac93c6c53 100644 --- a/python/paddle/tests/test_pretrained_model.py +++ b/python/paddle/tests/test_pretrained_model.py @@ -13,6 +13,8 @@ # limitations under the License. import unittest +import tempfile +import shutil import numpy as np import paddle @@ -23,27 +25,36 @@ import paddle.vision.models as models # test the predicted resutls of static graph and dynamic graph are equal # when used pretrained model class TestPretrainedModel(unittest.TestCase): - def infer(self, x, arch, dygraph=True): - if dygraph: - paddle.disable_static() - - net = models.__dict__[arch](pretrained=True, classifier_activation=None) - inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')] - model = paddle.Model(network=net, inputs=inputs) - model.prepare() - res = model.test_batch(x) - - if dygraph: - paddle.enable_static() - return res + def infer(self, arch): + path = tempfile.mkdtemp() + x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) + res = {} + for dygraph in [True, False]: + if not dygraph: + paddle.enable_static() + + net = models.__dict__[arch]() + inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')] + model = paddle.Model(network=net, inputs=inputs) + model.prepare() + + if dygraph: + model.save(path) + res['dygraph'] = model.test_batch(x) + else: + model.load(path) + res['static'] = model.test_batch(x) + + if not dygraph: + paddle.disable_static() + + shutil.rmtree(path) + np.testing.assert_allclose(res['dygraph'], res['static']) def test_models(self): arches = ['mobilenet_v1', 'mobilenet_v2', 'resnet18'] for arch in arches: - x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) - y_dygraph = self.infer(x, arch) - y_static = self.infer(x, arch, dygraph=False) - np.testing.assert_allclose(y_dygraph, y_static) + self.infer(arch) if __name__ == '__main__': diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py index 44f9ab5390122f086af4168e225fe2b5a2d8a9b2..6489b02615bb94269f83c4ed780e555c487eacbe 100644 --- a/python/paddle/tests/test_vision_models.py +++ b/python/paddle/tests/test_vision_models.py @@ -36,7 +36,7 @@ class TestVisonModels(unittest.TestCase): model.test_batch(x) def test_mobilenetv2_pretrained(self): - self.models_infer('mobilenet_v2', pretrained=True) + self.models_infer('mobilenet_v2', pretrained=False) def test_mobilenetv1(self): self.models_infer('mobilenet_v1') diff --git a/python/paddle/text/datasets/__init__.py b/python/paddle/text/datasets/__init__.py index b5cea40a4f4924fee7a76bad6030a21fa5a61268..71571d09b5c2bde8ba970624195973d2a1771789 100644 --- a/python/paddle/text/datasets/__init__.py +++ b/python/paddle/text/datasets/__init__.py @@ -16,7 +16,6 @@ from . import conll05 from . 
import imdb from . import imikolov from . import movielens -from . import movie_reviews from . import uci_housing from . import wmt14 from . import wmt16 @@ -25,7 +24,6 @@ from .conll05 import * from .imdb import * from .imikolov import * from .movielens import * -from .movie_reviews import * from .uci_housing import * from .wmt14 import * from .wmt16 import * @@ -34,7 +32,6 @@ __all__ = conll05.__all__ \ + imdb.__all__ \ + imikolov.__all__ \ + movielens.__all__ \ - + movie_reviews.__all__ \ + uci_housing.__all__ \ + wmt14.__all__ \ + wmt16.__all__ diff --git a/python/paddle/text/datasets/movie_reviews.py b/python/paddle/text/datasets/movie_reviews.py deleted file mode 100644 index db5b15654f96712abc842ca0c99654c1b7378808..0000000000000000000000000000000000000000 --- a/python/paddle/text/datasets/movie_reviews.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -import six -import numpy as np -import collections -import nltk -from nltk.corpus import movie_reviews -import zipfile -from functools import cmp_to_key -from itertools import chain - -import paddle -from paddle.io import Dataset - -__all__ = ['MovieReviews'] - -URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip" -MD5 = '155de2b77c6834dd8eea7cbe88e93acb' - -NUM_TRAINING_INSTANCES = 1600 -NUM_TOTAL_INSTANCES = 2000 - - -class MovieReviews(Dataset): - """ - Implementation of `NLTK movie reviews `_ dataset. - - Args: - data_file(str): path to data tar file, can be set None if - :attr:`download` is True. Default None - mode(str): 'train' 'test' mode. Default 'train'. - download(bool): whether auto download cifar dataset if - :attr:`data_file` unset. Default True. - - Returns: - Dataset: instance of movie reviews dataset - - Examples: - - .. 
code-block:: python - - import paddle - from paddle.text.datasets import MovieReviews - - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() - - def forward(self, word, category): - return paddle.sum(word), category - - paddle.disable_static() - - movie_reviews = MovieReviews(mode='train') - - for i in range(10): - word_list, category = movie_reviews[i] - word_list = paddle.to_tensor(word_list) - category = paddle.to_tensor(category) - - model = SimpleNet() - word_list, category = model(word_list, category) - print(word_list.numpy().shape, category.numpy()) - - """ - - def __init__(self, mode='train'): - assert mode.lower() in ['train', 'test'], \ - "mode should be 'train', 'test', but got {}".format(mode) - self.mode = mode.lower() - - self._download_data_if_not_yet() - - # read dataset into memory - self._load_sentiment_data() - - def _get_word_dict(self): - """ - Sorted the words by the frequency of words which occur in sample - :return: - words_freq_sorted - """ - words_freq_sorted = list() - word_freq_dict = collections.defaultdict(int) - - for category in movie_reviews.categories(): - for field in movie_reviews.fileids(category): - for words in movie_reviews.words(field): - word_freq_dict[words] += 1 - words_sort_list = list(six.iteritems(word_freq_dict)) - words_sort_list.sort(key=cmp_to_key(lambda a, b: b[1] - a[1])) - for index, word in enumerate(words_sort_list): - words_freq_sorted.append((word[0], index)) - return words_freq_sorted - - def _sort_files(self): - """ - Sorted the sample for cross reading the sample - :return: - files_list - """ - files_list = list() - neg_file_list = movie_reviews.fileids('neg') - pos_file_list = movie_reviews.fileids('pos') - files_list = list( - chain.from_iterable(list(zip(neg_file_list, pos_file_list)))) - return files_list - - def _load_sentiment_data(self): - """ - Load the data set - :return: - data_set - """ - self.data = [] - words_ids = dict(self._get_word_dict()) - for sample_file in self._sort_files(): - words_list = list() - category = 0 if 'neg' in sample_file else 1 - for word in movie_reviews.words(sample_file): - words_list.append(words_ids[word.lower()]) - self.data.append((words_list, category)) - - def _download_data_if_not_yet(self): - """ - Download the data set, if the data set is not download. 
- """ - try: - # download and extract movie_reviews.zip - paddle.dataset.common.download( - URL, 'corpora', md5sum=MD5, save_name='movie_reviews.zip') - path = os.path.join(paddle.dataset.common.DATA_HOME, 'corpora') - filename = os.path.join(path, 'movie_reviews.zip') - zip_file = zipfile.ZipFile(filename) - zip_file.extractall(path) - zip_file.close() - # make sure that nltk can find the data - if paddle.dataset.common.DATA_HOME not in nltk.data.path: - nltk.data.path.append(paddle.dataset.common.DATA_HOME) - movie_reviews.categories() - except LookupError: - print("Downloading movie_reviews data set, please wait.....") - nltk.download( - 'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME) - print("Download data set success.....") - print("Path is " + nltk.data.find('corpora/movie_reviews').path) - - def __getitem__(self, idx): - if self.mode == 'test': - idx += NUM_TRAINING_INSTANCES - data = self.data[idx] - return np.array(data[0]), np.array(data[1]) - - def __len__(self): - if self.mode == 'train': - return NUM_TRAINING_INSTANCES - else: - return NUM_TOTAL_INSTANCES - NUM_TRAINING_INSTANCES diff --git a/python/paddle/vision/models/lenet.py b/python/paddle/vision/models/lenet.py index c2d4be7cda10d580af44154e6a03e0871ec20706..b30d5992f9adf792f0bae90e19b9c00c4d47c0a2 100644 --- a/python/paddle/vision/models/lenet.py +++ b/python/paddle/vision/models/lenet.py @@ -12,20 +12,19 @@ #See the License for the specific language governing permissions and #limitations under the License. -import paddle.fluid as fluid -from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax +import paddle +import paddle.nn as nn __all__ = ['LeNet'] -class LeNet(fluid.dygraph.Layer): +class LeNet(nn.Layer): """LeNet model from `"LeCun Y, Bottou L, Bengio Y, et al. Gradient-based learning applied to document recognition[J]. Proceedings of the IEEE, 1998, 86(11): 2278-2324.`_ Args: num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 10. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. code-block:: python @@ -35,28 +34,27 @@ class LeNet(fluid.dygraph.Layer): model = LeNet() """ - def __init__(self, num_classes=10, classifier_activation='softmax'): + def __init__(self, num_classes=10): super(LeNet, self).__init__() self.num_classes = num_classes - self.features = Sequential( - Conv2d( + self.features = nn.Sequential( + nn.Conv2d( 1, 6, 3, stride=1, padding=1), - ReLU(), - Pool2D(2, 'max', 2), - Conv2d( + nn.ReLU(), + nn.MaxPool2d(2, 2), + nn.Conv2d( 6, 16, 5, stride=1, padding=0), - ReLU(), - Pool2D(2, 'max', 2)) + nn.ReLU(), + nn.MaxPool2d(2, 2)) if num_classes > 0: - self.fc = Sequential( - Linear(400, 120), Linear(120, 84), Linear(84, 10), - Softmax()) #Todo: accept any activation + self.fc = nn.Sequential( + nn.Linear(400, 120), nn.Linear(120, 84), nn.Linear(84, 10)) def forward(self, inputs): x = self.features(inputs) if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) + x = paddle.flatten(x, 1) x = self.fc(x) return x diff --git a/python/paddle/vision/models/mobilenetv1.py b/python/paddle/vision/models/mobilenetv1.py index 10defbf593dca642386e73b65094612f93dce9dc..39654122e3b33e52f4693653dbdd14d6e513228e 100644 --- a/python/paddle/vision/models/mobilenetv1.py +++ b/python/paddle/vision/models/mobilenetv1.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.fluid as fluid -from paddle.fluid.initializer import MSRA -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear +import paddle +import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url @@ -24,85 +22,66 @@ __all__ = ['MobileNetV1', 'mobilenet_v1'] model_urls = { 'mobilenetv1_1.0': ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams', - 'bf0d25cb0bed1114d9dac9384ce2b4a6') + '42a154c2f26f86e7457d6daded114e8c') } -class ConvBNLayer(fluid.dygraph.Layer): +class ConvBNLayer(nn.Layer): def __init__(self, - num_channels, - filter_size, - num_filters, + in_channels, + out_channels, + kernel_size, stride, padding, - channels=None, - num_groups=1, - act='relu', - use_cudnn=True, - name=None): + num_groups=1): super(ConvBNLayer, self).__init__() - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, + self._conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, stride=stride, padding=padding, groups=num_groups, - act=None, - use_cudnn=use_cudnn, - param_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "_weights"), bias_attr=False) - self._batch_norm = BatchNorm( - num_filters, - act=act, - param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), - bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"), - moving_mean_name=self.full_name() + "_bn" + '_mean', - moving_variance_name=self.full_name() + "_bn" + '_variance') + self._norm_layer = nn.BatchNorm2d(out_channels) + self._act = nn.ReLU() - def forward(self, inputs): - y = self._conv(inputs) - y = self._batch_norm(y) - return y + def forward(self, x): + x = self._conv(x) + x = self._norm_layer(x) + x = self._act(x) + return x -class DepthwiseSeparable(fluid.dygraph.Layer): - def __init__(self, - num_channels, - num_filters1, - num_filters2, - num_groups, - stride, - scale, - name=None): +class DepthwiseSeparable(nn.Layer): + def __init__(self, in_channels, out_channels1, out_channels2, num_groups, + stride, scale): super(DepthwiseSeparable, self).__init__() self._depthwise_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=int(num_filters1 * scale), - filter_size=3, + in_channels, + int(out_channels1 * scale), + kernel_size=3, stride=stride, padding=1, - num_groups=int(num_groups * scale), - use_cudnn=False) + num_groups=int(num_groups * scale)) self._pointwise_conv = ConvBNLayer( - num_channels=int(num_filters1 * scale), - filter_size=1, - num_filters=int(num_filters2 * scale), + int(out_channels1 * scale), + int(out_channels2 * scale), + kernel_size=1, stride=1, padding=0) - def forward(self, inputs): - y = self._depthwise_conv(inputs) - y = self._pointwise_conv(y) - return y + def forward(self, x): + x = self._depthwise_conv(x) + x = self._pointwise_conv(x) + return x -class MobileNetV1(fluid.dygraph.Layer): +class MobileNetV1(nn.Layer): """MobileNetV1 model from `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" `_. @@ -111,7 +90,6 @@ class MobileNetV1(fluid.dygraph.Layer): num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000. with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. 
code-block:: python @@ -121,11 +99,7 @@ class MobileNetV1(fluid.dygraph.Layer): model = MobileNetV1() """ - def __init__(self, - scale=1.0, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): super(MobileNetV1, self).__init__() self.scale = scale self.dwsl = [] @@ -133,18 +107,17 @@ class MobileNetV1(fluid.dygraph.Layer): self.with_pool = with_pool self.conv1 = ConvBNLayer( - num_channels=3, - filter_size=3, - channels=3, - num_filters=int(32 * scale), + in_channels=3, + out_channels=int(32 * scale), + kernel_size=3, stride=2, padding=1) dws21 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(32 * scale), - num_filters1=32, - num_filters2=64, + in_channels=int(32 * scale), + out_channels1=32, + out_channels2=64, num_groups=32, stride=1, scale=scale), @@ -153,9 +126,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws22 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(64 * scale), - num_filters1=64, - num_filters2=128, + in_channels=int(64 * scale), + out_channels1=64, + out_channels2=128, num_groups=64, stride=2, scale=scale), @@ -164,9 +137,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws31 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=128, + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=128, num_groups=128, stride=1, scale=scale), @@ -175,9 +148,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws32 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=256, + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=256, num_groups=128, stride=2, scale=scale), @@ -186,9 +159,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws41 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=256, + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=256, num_groups=256, stride=1, scale=scale), @@ -197,9 +170,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws42 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=512, + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=512, num_groups=256, stride=2, scale=scale), @@ -209,9 +182,9 @@ class MobileNetV1(fluid.dygraph.Layer): for i in range(5): tmp = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=512, + in_channels=int(512 * scale), + out_channels1=512, + out_channels2=512, num_groups=512, stride=1, scale=scale), @@ -220,9 +193,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws56 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=1024, + in_channels=int(512 * scale), + out_channels1=512, + out_channels2=1024, num_groups=512, stride=2, scale=scale), @@ -231,9 +204,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws6 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(1024 * scale), - num_filters1=1024, - num_filters2=1024, + in_channels=int(1024 * scale), + out_channels1=1024, + out_channels2=1024, num_groups=1024, stride=1, scale=scale), @@ -241,29 +214,23 @@ class MobileNetV1(fluid.dygraph.Layer): self.dwsl.append(dws6) if with_pool: - self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) - - if num_classes > -1: - self.out = Linear( - int(1024 * scale), - 
num_classes, - act=classifier_activation, - param_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "fc7_weights"), - bias_attr=ParamAttr(name="fc7_offset")) - - def forward(self, inputs): - y = self.conv1(inputs) + self.pool2d_avg = nn.AdaptiveAvgPool2d(1) + + if num_classes > 0: + self.fc = nn.Linear(int(1024 * scale), num_classes) + + def forward(self, x): + x = self.conv1(x) for dws in self.dwsl: - y = dws(y) + x = dws(x) if self.with_pool: - y = self.pool2d_avg(y) + x = self.pool2d_avg(x) if self.num_classes > 0: - y = fluid.layers.reshape(y, shape=[-1, 1024]) - y = self.out(y) - return y + x = paddle.flatten(x, 1) + x = self.fc(x) + return x def _mobilenet(arch, pretrained=False, **kwargs): @@ -275,7 +242,7 @@ def _mobilenet(arch, pretrained=False, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.load_dict(param) return model diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py index c08fb88f8bdb234fec99ed139aa7eb6249965c79..bab8b7b2b1b93bb17612843bf0032ee278c3e93f 100644 --- a/python/paddle/vision/models/mobilenetv2.py +++ b/python/paddle/vision/models/mobilenetv2.py @@ -14,9 +14,9 @@ import numpy as np import paddle -import paddle.fluid as fluid -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear + +import paddle.nn as nn +import paddle.nn.functional as F from paddle.utils.download import get_weights_path_from_url @@ -25,221 +25,166 @@ __all__ = ['MobileNetV2', 'mobilenet_v2'] model_urls = { 'mobilenetv2_1.0': ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams', - '8ff74f291f72533f2a7956a4efff9d88') + '0340af0a901346c8d46f4529882fb63d') } -class ConvBNLayer(fluid.dygraph.Layer): - def __init__(self, - num_channels, - filter_size, - num_filters, - stride, - padding, - channels=None, - num_groups=1, - use_cudnn=True): - super(ConvBNLayer, self).__init__() - - tmp_param = ParamAttr(name=self.full_name() + "_weights") - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=padding, - groups=num_groups, - act=None, - use_cudnn=use_cudnn, - param_attr=tmp_param, - bias_attr=False) - - self._batch_norm = BatchNorm( - num_filters, - param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), - bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"), - moving_mean_name=self.full_name() + "_bn" + '_mean', - moving_variance_name=self.full_name() + "_bn" + '_variance') - - def forward(self, inputs, if_act=True): - y = self._conv(inputs) - y = self._batch_norm(y) - if if_act: - y = fluid.layers.relu6(y) - return y - - -class InvertedResidualUnit(fluid.dygraph.Layer): - def __init__( - self, - num_channels, - num_in_filter, - num_filters, - stride, - filter_size, - padding, - expansion_factor, ): - super(InvertedResidualUnit, self).__init__() - num_expfilter = int(round(num_in_filter * expansion_factor)) - self._expand_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=num_expfilter, - filter_size=1, - stride=1, - padding=0, - num_groups=1) - - self._bottleneck_conv = ConvBNLayer( - num_channels=num_expfilter, - num_filters=num_expfilter, - filter_size=filter_size, - stride=stride, - padding=padding, - num_groups=num_expfilter, - use_cudnn=False) - - self._linear_conv = ConvBNLayer( - 
num_channels=num_expfilter, - num_filters=num_filters, - filter_size=1, - stride=1, - padding=0, - num_groups=1) - - def forward(self, inputs, ifshortcut): - y = self._expand_conv(inputs, if_act=True) - y = self._bottleneck_conv(y, if_act=True) - y = self._linear_conv(y, if_act=False) - if ifshortcut: - y = fluid.layers.elementwise_add(inputs, y) - return y - - -class InvresiBlocks(fluid.dygraph.Layer): - def __init__(self, in_c, t, c, n, s): - super(InvresiBlocks, self).__init__() - - self._first_block = InvertedResidualUnit( - num_channels=in_c, - num_in_filter=in_c, - num_filters=c, - stride=s, - filter_size=3, - padding=1, - expansion_factor=t) - - self._inv_blocks = [] - for i in range(1, n): - tmp = self.add_sublayer( - sublayer=InvertedResidualUnit( - num_channels=c, - num_in_filter=c, - num_filters=c, - stride=1, - filter_size=3, - padding=1, - expansion_factor=t), - name=self.full_name() + "_" + str(i + 1)) - self._inv_blocks.append(tmp) - - def forward(self, inputs): - y = self._first_block(inputs, ifshortcut=False) - for inv_block in self._inv_blocks: - y = inv_block(y, ifshortcut=True) - return y - - -class MobileNetV2(fluid.dygraph.Layer): - """MobileNetV2 model from - `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. - - Args: - scale (float): scale of channels in each layer. Default: 1.0. - num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer - will not be defined. Default: 1000. - with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. - - Examples: - .. code-block:: python +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - from paddle.vision.models import MobileNetV2 + if new_v < 0.9 * v: + new_v += divisor + return new_v - model = MobileNetV2() - """ +class ConvBNReLU(nn.Sequential): + def __init__(self, + in_planes, + out_planes, + kernel_size=3, + stride=1, + groups=1, + norm_layer=nn.BatchNorm2d): + padding = (kernel_size - 1) // 2 + + super(ConvBNReLU, self).__init__( + nn.Conv2d( + in_planes, + out_planes, + kernel_size, + stride, + padding, + groups=groups, + bias_attr=False), + norm_layer(out_planes), + nn.ReLU6()) + + +class InvertedResidual(nn.Layer): def __init__(self, - scale=1.0, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): + inp, + oup, + stride, + expand_ratio, + norm_layer=nn.BatchNorm2d): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = int(round(inp * expand_ratio)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers = [] + if expand_ratio != 1: + layers.append( + ConvBNReLU( + inp, hidden_dim, kernel_size=1, norm_layer=norm_layer)) + layers.extend([ + ConvBNReLU( + hidden_dim, + hidden_dim, + stride=stride, + groups=hidden_dim, + norm_layer=norm_layer), + nn.Conv2d( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + norm_layer(oup), + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Layer): + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): + """MobileNetV2 model from + `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. + + Args: + scale (float): scale of channels in each layer. Default: 1.0. + num_classes (int): output dim of last fc layer. 
If num_classes <=0, last fc layer + will not be defined. Default: 1000. + with_pool (bool): use pool before the last fc layer or not. Default: True. + + Examples: + .. code-block:: python + + from paddle.vision.models import MobileNetV2 + + model = MobileNetV2() + """ super(MobileNetV2, self).__init__() - self.scale = scale self.num_classes = num_classes self.with_pool = with_pool + input_channel = 32 + last_channel = 1280 + + block = InvertedResidual + round_nearest = 8 + norm_layer = nn.BatchNorm2d + inverted_residual_setting = [ + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] - bottleneck_params_list = [ - (1, 16, 1, 1), - (6, 24, 2, 2), - (6, 32, 3, 2), - (6, 64, 4, 2), - (6, 96, 3, 1), - (6, 160, 3, 2), - (6, 320, 1, 1), + input_channel = _make_divisible(input_channel * scale, round_nearest) + self.last_channel = _make_divisible(last_channel * max(1.0, scale), + round_nearest) + features = [ + ConvBNReLU( + 3, input_channel, stride=2, norm_layer=norm_layer) ] - self._conv1 = ConvBNLayer( - num_channels=3, - num_filters=int(32 * scale), - filter_size=3, - stride=2, - padding=1) - - self._invl = [] - i = 1 - in_c = int(32 * scale) - for layer_setting in bottleneck_params_list: - t, c, n, s = layer_setting - i += 1 - tmp = self.add_sublayer( - sublayer=InvresiBlocks( - in_c=in_c, t=t, c=int(c * scale), n=n, s=s), - name='conv' + str(i)) - self._invl.append(tmp) - in_c = int(c * scale) - - self._out_c = int(1280 * scale) if scale > 1.0 else 1280 - self._conv9 = ConvBNLayer( - num_channels=in_c, - num_filters=self._out_c, - filter_size=1, - stride=1, - padding=0) + for t, c, n, s in inverted_residual_setting: + output_channel = _make_divisible(c * scale, round_nearest) + for i in range(n): + stride = s if i == 0 else 1 + features.append( + block( + input_channel, + output_channel, + stride, + expand_ratio=t, + norm_layer=norm_layer)) + input_channel = output_channel + + features.append( + ConvBNReLU( + input_channel, + self.last_channel, + kernel_size=1, + norm_layer=norm_layer)) + + self.features = nn.Sequential(*features) if with_pool: - self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) - - if num_classes > 0: - tmp_param = ParamAttr(name=self.full_name() + "fc10_weights") - self._fc = Linear( - self._out_c, - num_classes, - act=classifier_activation, - param_attr=tmp_param, - bias_attr=ParamAttr(name="fc10_offset")) - - def forward(self, inputs): - y = self._conv1(inputs, if_act=True) - for inv in self._invl: - y = inv(y) - y = self._conv9(y, if_act=True) + self.pool2d_avg = nn.AdaptiveAvgPool2d(1) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Dropout(0.2), nn.Linear(self.last_channel, num_classes)) + + def forward(self, x): + x = self.features(x) if self.with_pool: - y = self._pool2d_avg(y) + x = self.pool2d_avg(x) + if self.num_classes > 0: - y = fluid.layers.reshape(y, shape=[-1, self._out_c]) - y = self._fc(y) - return y + x = paddle.flatten(x, 1) + x = self.classifier(x) + return x def _mobilenet(arch, pretrained=False, **kwargs): @@ -251,7 +196,7 @@ def _mobilenet(arch, pretrained=False, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.load_dict(param) return model diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py index 
da0c3e9eb3f67f0aad67cdef3c5527cb2275e844..f9e00aefd6bb2b6d0b7bf75055cc735c6651a52d 100644 --- a/python/paddle/vision/models/resnet.py +++ b/python/paddle/vision/models/resnet.py @@ -15,11 +15,8 @@ from __future__ import division from __future__ import print_function -import math -import paddle.fluid as fluid - -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear -from paddle.fluid.dygraph.container import Sequential +import paddle +import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url @@ -29,143 +26,129 @@ __all__ = [ model_urls = { 'resnet18': ('https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams', - '0ba53eea9bc970962d0ef96f7b94057e'), + 'cf548f46534aa3560945be4b95cd11c4'), 'resnet34': ('https://paddle-hapi.bj.bcebos.com/models/resnet34.pdparams', - '46bc9f7c3dd2e55b7866285bee91eff3'), + '8d2275cf8706028345f78ac0e1d31969'), 'resnet50': ('https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams', - '5ce890a9ad386df17cf7fe2313dca0a1'), + 'ca6f485ee1ab0492d38f323885b0ad80'), 'resnet101': ('https://paddle-hapi.bj.bcebos.com/models/resnet101.pdparams', - 'fb07a451df331e4b0bb861ed97c3a9b9'), + '02f35f034ca3858e1e54d4036443c92d'), 'resnet152': ('https://paddle-hapi.bj.bcebos.com/models/resnet152.pdparams', - 'f9c700f26d3644bb76ad2226ed5f5713'), + '7ad16a2f1e7333859ff986138630fd7a'), } -class ConvBNLayer(fluid.dygraph.Layer): +class BasicBlock(nn.Layer): + expansion = 1 + def __init__(self, - num_channels, - num_filters, - filter_size, + inplanes, + planes, stride=1, + downsample=None, groups=1, - act=None): - super(ConvBNLayer, self).__init__() - - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - act=None, - bias_attr=False) - - self._batch_norm = BatchNorm(num_filters, act=act) - - def forward(self, inputs): - x = self._conv(inputs) - x = self._batch_norm(x) - - return x - - -class BasicBlock(fluid.dygraph.Layer): - """residual block of resnet18 and resnet34 - """ - expansion = 1 - - def __init__(self, num_channels, num_filters, stride, shortcut=True): + base_width=64, + dilation=1, + norm_layer=None): super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=3, - act='relu') - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, - stride=stride, - act='relu') + if dilation > 1: + raise NotImplementedError( + "Dilation > 1 not supported in BasicBlock") - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - stride=stride) + self.conv1 = nn.Conv2d( + inplanes, planes, 3, padding=1, stride=stride, bias_attr=False) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias_attr=False) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride - self.shortcut = shortcut + def forward(self, x): + identity = x - def forward(self, inputs): - y = self.conv0(inputs) - conv1 = self.conv1(y) + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) - if self.shortcut: - short = inputs - else: - short = self.short(inputs) + out = self.conv2(out) + out = self.bn2(out) - y = short + conv1 + if self.downsample is not None: + identity = self.downsample(x) - return fluid.layers.relu(y) + out += identity + 
        out = self.relu(out)
+        return out
 
 
-class BottleneckBlock(fluid.dygraph.Layer):
-    """residual block of resnet50, resnet101 amd resnet152
-    """
+
+class BottleneckBlock(nn.Layer):
     expansion = 4
 
-    def __init__(self, num_channels, num_filters, stride, shortcut=True):
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 groups=1,
+                 base_width=64,
+                 dilation=1,
+                 norm_layer=None):
         super(BottleneckBlock, self).__init__()
-
-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=1,
-            act='relu')
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        width = int(planes * (base_width / 64.)) * groups
+
+        self.conv1 = nn.Conv2d(inplanes, width, 1, bias_attr=False)
+        self.bn1 = norm_layer(width)
+
+        self.conv2 = nn.Conv2d(
+            width,
+            width,
+            3,
+            padding=dilation,
             stride=stride,
-            act='relu')
-        self.conv2 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters * self.expansion,
-            filter_size=1,
-            act=None)
+            groups=groups,
+            dilation=dilation,
+            bias_attr=False)
+        self.bn2 = norm_layer(width)
 
-        if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters * self.expansion,
-                filter_size=1,
-                stride=stride)
+        self.conv3 = nn.Conv2d(
+            width, planes * self.expansion, 1, bias_attr=False)
+        self.bn3 = norm_layer(planes * self.expansion)
+        self.relu = nn.ReLU()
+        self.downsample = downsample
+        self.stride = stride
 
-        self.shortcut = shortcut
+    def forward(self, x):
+        identity = x
 
-        self._num_channels_out = num_filters * self.expansion
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
 
-    def forward(self, inputs):
-        x = self.conv0(inputs)
-        conv1 = self.conv1(x)
-        conv2 = self.conv2(conv1)
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
 
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
+        out = self.conv3(out)
+        out = self.bn3(out)
 
-        x = fluid.layers.elementwise_add(x=short, y=conv2)
+        if self.downsample is not None:
+            identity = self.downsample(x)
 
-        return fluid.layers.relu(x)
+        out += identity
+        out = self.relu(out)
+        return out
 
 
-class ResNet(fluid.dygraph.Layer):
+
+class ResNet(nn.Layer):
     """ResNet model from
     `"Deep Residual Learning for Image Recognition" `_
 
@@ -175,7 +158,6 @@ class ResNet(fluid.dygraph.Layer):
         num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000.
         with_pool (bool): use pool before the last fc layer or not. Default: True.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
 
     Examples:
         .. code-block:: python
@@ -189,82 +171,87 @@ class ResNet(fluid.dygraph.Layer):
     """
 
-    def __init__(self,
-                 Block,
-                 depth=50,
-                 num_classes=1000,
-                 with_pool=True,
-                 classifier_activation='softmax'):
+    def __init__(self, block, depth, num_classes=1000, with_pool=True):
         super(ResNet, self).__init__()
-
-        self.num_classes = num_classes
-        self.with_pool = with_pool
-
-        layer_config = {
+        layer_cfg = {
             18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
-            152: [3, 8, 36, 3],
+            152: [3, 8, 36, 3]
         }
-        assert depth in layer_config.keys(), \
-            "supported depth are {} but input layer is {}".format(
-                layer_config.keys(), depth)
-
-        layers = layer_config[depth]
-
-        in_channels = 64
-        out_channels = [64, 128, 256, 512]
-
-        self.conv = ConvBNLayer(
-            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
-        self.pool = Pool2D(
-            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
-
-        self.layers = []
-        for idx, num_blocks in enumerate(layers):
-            blocks = []
-            shortcut = False
-            for b in range(num_blocks):
-                if b == 1:
-                    in_channels = out_channels[idx] * Block.expansion
-                block = Block(
-                    num_channels=in_channels,
-                    num_filters=out_channels[idx],
-                    stride=2 if b == 0 and idx != 0 else 1,
-                    shortcut=shortcut)
-                blocks.append(block)
-                shortcut = True
-            layer = self.add_sublayer("layer_{}".format(idx),
-                                      Sequential(*blocks))
-            self.layers.append(layer)
+        layers = layer_cfg[depth]
+        self.num_classes = num_classes
+        self.with_pool = with_pool
+        self._norm_layer = nn.BatchNorm2d
+
+        self.inplanes = 64
+        self.dilation = 1
+        self.conv1 = nn.Conv2d(
+            3,
+            self.inplanes,
+            kernel_size=7,
+            stride=2,
+            padding=3,
+            bias_attr=False)
+        self.bn1 = self._norm_layer(self.inplanes)
+        self.relu = nn.ReLU()
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
 
         if with_pool:
-            self.global_pool = Pool2D(
-                pool_size=7, pool_type='avg', global_pooling=True)
+            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
 
         if num_classes > 0:
-            stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0)
-            self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1
-            self.fc = Linear(
-                self.fc_input_dim,
-                num_classes,
-                act=classifier_activation,
-                param_attr=fluid.param_attr.ParamAttr(
-                    initializer=fluid.initializer.Uniform(-stdv, stdv)))
-
-    def forward(self, inputs):
-        x = self.conv(inputs)
-        x = self.pool(x)
-        for layer in self.layers:
-            x = layer(x)
-
-        if self.with_pool:
-            x = self.global_pool(x)
-
-        if self.num_classes > -1:
-            x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim])
+            self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
+        norm_layer = self._norm_layer
+        downsample = None
+        previous_dilation = self.dilation
+        if dilate:
+            self.dilation *= stride
+            stride = 1
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(
+                    self.inplanes,
+                    planes * block.expansion,
+                    1,
+                    stride=stride,
+                    bias_attr=False),
+                norm_layer(planes * block.expansion), )
+
+        layers = []
+        layers.append(
+            block(self.inplanes, planes, stride, downsample, 1, 64,
+                  previous_dilation, norm_layer))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(block(self.inplanes, planes, norm_layer=norm_layer))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        if self.with_pool > 0:
+            x = self.avgpool(x)
+
+        if self.num_classes > 0:
+            x = paddle.flatten(x, 1)
             x = self.fc(x)
+
         return x
 
@@ -277,7 +264,7 @@ def _resnet(arch, Block, depth, pretrained, **kwargs):
                                                model_urls[arch][1])
         assert weight_path.endswith(
             '.pdparams'), "suffix of weight must be .pdparams"
-        param, _ = fluid.load_dygraph(weight_path)
+        param, _ = paddle.load(weight_path)
         model.set_dict(param)
 
     return model
diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py
index 8bfacda2476d0e24e549513b379181bf47e40d45..d11845b6616267d1cdfff197cc2c4a25a62c7d9e 100644
--- a/python/paddle/vision/models/vgg.py
+++ b/python/paddle/vision/models/vgg.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.fluid as fluid
-from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU, Softmax
-from paddle.fluid.dygraph.container import Sequential
+import paddle
+import paddle.nn as nn
 
 from paddle.utils.download import get_weights_path_from_url
 
@@ -28,39 +27,18 @@ __all__ = [
 
 model_urls = {
     'vgg16': ('https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams',
-              'c788f453a3b999063e8da043456281ee')
+              '89bbffc0f87d260be9b8cdc169c991c4')
 }
 
 
-class Classifier(fluid.dygraph.Layer):
-    def __init__(self, num_classes, classifier_activation='softmax'):
-        super(Classifier, self).__init__()
-        self.linear1 = Linear(512 * 7 * 7, 4096)
-        self.linear2 = Linear(4096, 4096)
-        self.linear3 = Linear(4096, num_classes)
-        self.act = Softmax() #Todo: accept any activation
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = fluid.layers.relu(x)
-        x = fluid.layers.dropout(x, 0.5)
-        x = self.linear2(x)
-        x = fluid.layers.relu(x)
-        x = fluid.layers.dropout(x, 0.5)
-        x = self.linear3(x)
-        out = self.act(x)
-        return out
-
-
-class VGG(fluid.dygraph.Layer):
+class VGG(nn.Layer):
     """VGG model from
     `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_
 
     Args:
-        features (fluid.dygraph.Layer): vgg features create by function make_layers.
+        features (nn.Layer): vgg features create by function make_layers.
         num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
 
     Examples:
         .. code-block:: python
@@ -76,44 +54,41 @@ class VGG(fluid.dygraph.Layer):
     """
 
-    def __init__(self,
-                 features,
-                 num_classes=1000,
-                 classifier_activation='softmax'):
+    def __init__(self, features, num_classes=1000):
         super(VGG, self).__init__()
         self.features = features
-        self.num_classes = num_classes
-
-        if num_classes > 0:
-            classifier = Classifier(num_classes, classifier_activation)
-            self.classifier = self.add_sublayer("classifier",
-                                                Sequential(classifier))
+        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
+        self.classifier = nn.Sequential(
+            nn.Linear(512 * 7 * 7, 4096),
+            nn.ReLU(),
+            nn.Dropout(),
+            nn.Linear(4096, 4096),
+            nn.ReLU(),
+            nn.Dropout(),
+            nn.Linear(4096, num_classes), )
 
     def forward(self, x):
         x = self.features(x)
-
-        if self.num_classes > 0:
-            x = fluid.layers.flatten(x, 1)
-            x = self.classifier(x)
+        x = self.avgpool(x)
+        x = paddle.flatten(x, 1)
+        x = self.classifier(x)
         return x
 
 
 def make_layers(cfg, batch_norm=False):
     layers = []
     in_channels = 3
-
     for v in cfg:
         if v == 'M':
-            layers += [Pool2D(pool_size=2, pool_stride=2)]
+            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
         else:
+            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
             if batch_norm:
-                conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1)
-                layers += [conv2d, BatchNorm(v), ReLU()]
+                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU()]
            else:
-                conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1)
-                layers += [conv2d, ReLU()]
+                layers += [conv2d, nn.ReLU()]
             in_channels = v
-    return Sequential(*layers)
+    return nn.Sequential(*layers)
 
 
 cfgs = {
@@ -144,7 +119,7 @@ def _vgg(arch, cfg, batch_norm, pretrained, **kwargs):
                                                model_urls[arch][1])
         assert weight_path.endswith(
             '.pdparams'), "suffix of weight must be .pdparams"
-        param, _ = fluid.load_dygraph(weight_path)
+        param, _ = paddle.load(weight_path)
         model.load_dict(param)
 
     return model
diff --git a/python/setup.py.in b/python/setup.py.in
index d85a23a5edd31f77514b468731097759f47533c1..467c5cb86779b80e51794cf800226d64534e8676 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -155,6 +155,7 @@ packages=['paddle',
           'paddle.distributed.fleet.utils',
           'paddle.framework',
           'paddle.jit',
+          'paddle.inference',
           'paddle.fluid',
           'paddle.fluid.inference',
           'paddle.fluid.dygraph',
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index 943b8c01e8cc0c0e0a41e9b01951939f454c3181..1db3f6d3d27ec356a78946ceb7e14825a5c40526 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -39,8 +39,8 @@ fi
 
 api_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5 ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5`
 if [ "$api_spec_diff" != "" ]; then
-    echo_line="${echo_line}Related APIs: ${api_spec_diff}\n"
-    echo_line="You must have one RD (zhiqiu (Recommend) or phlrain) approval for the api change for the opreator-related api without 'core.ops'.\n"
+    echo_line="APIs without core.ops: \n${api_spec_diff}\n"
+    echo_line="${echo_line}You must have one RD (zhiqiu (Recommend) or phlrain) approval for the api change for the opreator-related api without 'core.ops'.\n"
     echo_line="${echo_line}For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/paddle_api_development_manual.md]\n"
     check_approval 1 6888866 43953930
 fi
diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos
index b10e76a4b4d037bfa0d72e74e660cf696f5ee1d3..a50d08354b8b4572516296546fb2cc87548b5349 100644
--- a/tools/dockerfile/Dockerfile.centos
+++ b/tools/dockerfile/Dockerfile.centos
@@ -80,9 +80,7 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
     make -j8 && make install && \
     ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
 
-# gcc4.8 TRT
-RUN mkdir -p /opt/compiler && cd /opt/compiler && \
-    wget -q https://paddle-ci.gz.bcebos.com/gcc-4.8.2.tar.gz && \
-    tar xf gcc-4.8.2.tar.gz && rm -f gcc-4.8.2.tar.gz
+# Downgrade gcc&&g++
+
 
 CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh
index 3716084487e6ab8ba89cb25698e209a1e1859a8b..9c8f8f563abb73915002fe675d64440c72100d23 100644
--- a/tools/dockerfile/ci_dockerfile.sh
+++ b/tools/dockerfile/ci_dockerfile.sh
@@ -21,7 +21,7 @@ function make_ubuntu_dockerfile(){
 
 function make_centos_dockerfile(){
   dockerfile_name="Dockerfile.cuda9_cudnn7_gcc48_py35_centos6"
-  sed 's//9.0-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name}
+  sed 's//10.2-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name}
   sed -i 's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g' ${dockerfile_name}
   dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'`
   sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so && \
@@ -29,6 +29,15 @@ function make_centos_dockerfile(){
     rm -rf /usr/include/NvInfer*" ${dockerfile_name}
   sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \
     tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name}
+  sed -i 's##WORKDIR /usr/bin \
+    COPY tools/dockerfile/build_scripts /build_scripts \
+    RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \
+    RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \
+    RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \
+    RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \
+    RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \
+    RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \
+    ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g' ${dockerfile_name}
 }
diff --git a/tools/enforce/count_enforce_by_file.sh b/tools/enforce/count_enforce_by_file.sh
index 1858bd0fd17aac7273318ddbb37fc0d9c512f48d..c1e2903c092ce4124c55566679e081dbe3a03445 100644
--- a/tools/enforce/count_enforce_by_file.sh
+++ b/tools/enforce/count_enforce_by_file.sh
@@ -57,7 +57,14 @@ FILE_WHITE_LIST="\
     random_crop_op.h \
     elementwise_op_function.cu.h \
     fused_elemwise_activation_op.cc \
-    auc_op.cu"
+    auc_op.cu \
+    unsqueeze_op.h \
+    unsqueeze_op.cc \
+    enforce.h \
+    errors_test.cc \
+    cross_entropy.cu \
+    cross_entropy.h \
+    unpooling.cu"
 
 function count_file_recursively(){
     dir_name=$1
diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index 033b4b8723aa30465cdb07198f470d7c09a0f326..d23c18a44e936628f8a63fe9ebca86c1f61a5cd0 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -534,13 +534,6 @@ def get_incrementapi():
             f.write('\n')
 
 
-# only white on CPU
-gpu_not_white = [
-    "deformable_conv", "cuda_places", "CUDAPinnedPlace", "CUDAPlace",
-    "cuda_profiler", 'DGCMomentumOptimizer'
-]
-
-
 def get_wlist():
     '''
     this function will get the white list of API.
@@ -552,17 +545,25 @@ def get_wlist():
     '''
     wlist = []
     wlist_file = []
+    # only white on CPU
+    gpu_not_white = []
     with open("wlist.json", 'r') as load_f:
         load_dict = json.load(load_f)
         for key in load_dict:
-            if key == 'wlist_file':
-                wlist_file = wlist_file + load_dict[key]
+            if key == 'wlist_dir':
+                for item in load_dict[key]:
+                    wlist_file.append(item["name"])
+            elif key == "gpu_not_white":
+                gpu_not_white = load_dict[key]
+            elif key == "wlist_api":
+                for item in load_dict[key]:
+                    wlist.append(item["name"])
             else:
                 wlist = wlist + load_dict[key]
-    return wlist, wlist_file
+    return wlist, wlist_file, gpu_not_white
 
 
-wlist, wlist_file = get_wlist()
+wlist, wlist_file, gpu_not_white = get_wlist()
 
 if len(sys.argv) < 2:
     print("Error: inadequate number of arguments")
diff --git a/tools/wlist.json b/tools/wlist.json
index 20f6a9cbaedb391995b3757612ec24f2061a8a81..0ed0b4e40698ce26fbddb7e5a421143749b3a3ef 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -1,11 +1,29 @@
 {
-    "wlist_file" : [
-        "../python/paddle/fluid/contrib",
-        "../python/paddle/verison.py",
-        "../python/paddle/fluid/core_avx.py",
-        "../python/paddle/distributed"
+    "wlist_dir":[
+        {
+            "name":"../python/paddle/fluid/contrib",
+            "annotation":""
+        },
+        {
+            "name":"../python/paddle/verison.py",
+            "annotation":""
+        },
+        {
+            "name":"../python/paddle/fluid/core_avx.py",
+            "annotation":""
+        },
+        {
+            "name":"../python/paddle/distributed",
+            "annotation":""
+        }
     ],
-    "wlist_inneed":[
+    "wlist_api":[
+        {
+            "name":"xxxxx",
+            "annotation":"not a real api, just for example"
+        }
+    ],
+    "wlist_temp_api":[
         "append_LARS",
         "BuildStrategy.debug_graphviz_path",
         "BuildStrategy.enable_sequential_execution",
@@ -63,9 +81,7 @@
         "cuda_places",
         "CUDAPinnedPlace",
         "CUDAPlace",
-        "Program.parse_from_string"
-    ],
-    "wlist_nosample":[
+        "Program.parse_from_string",
         "Compressor",
         "Compressor.config",
         "Compressor.run",
@@ -105,8 +121,6 @@
         "convert_dist_to_sparse_program",
         "load_persistables_for_increment",
         "load_persistables_for_inference",
-        "cache",
-        "buffered",
         "xmap_readers",
         "Metric.reset",
         "Metric.update",
@@ -161,13 +175,9 @@
         "RNN",
         "BiRNN",
         "RNNCellBase",
-        "RNNCellBase.get_initial_states"
-    ],
-    "wlist_no_op_pass":[
+        "RNNCellBase.get_initial_states",
         "gelu",
-        "erf"
-    ],
-    "wlist_ci_nopass":[
+        "erf",
         "DecodeHelper",
         "DecodeHelper.initialize",
         "DecodeHelper.sample",
@@ -190,9 +200,7 @@
         "SampleEmbeddingHelper",
         "BasicDecoder",
         "lstm",
-        "partial_sum"
-    ],
-    "wlist_nopass":[
+        "partial_sum",
         "StateCell",
         "StateCell.compute_state",
         "TrainingDecoder",
@@ -244,9 +252,7 @@
         "GroupNorm",
         "SpectralNorm",
         "TreeConv",
-        "prroi_pool"
-    ],
-    "wlist_temp":[
+        "prroi_pool",
         "to_tensor",
         "ChunkEvaluator",
         "EditDistance",
@@ -324,9 +330,7 @@
         "Conv2DTranspose",
         "QueueDataset.local_shuffle",
         "save_persistables@dygraph/checkpoint.py",
-        "load_persistables@dygraph/checkpoint.py"
-    ],
-    "wlist_ignore":[
+        "load_persistables@dygraph/checkpoint.py",
         "elementwise_pow",
         "WeightedAverage.reset",
         "ChunkEvaluator.eval",
@@ -403,5 +407,13 @@
         "LinearChainCRF.forward",
         "CRFDecoding.forward",
         "SequenceTagging.forward"
+    ],
+    "gpu_not_white":[
+        "deformable_conv",
+        "cuda_places",
+        "CUDAPinnedPlace",
+        "CUDAPlace",
+        "cuda_profiler",
+        "DGCMomentumOptimizer"
     ]
 }
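Reviewer note (not part of the patch): a minimal usage sketch of the refactored nn.Layer-based ResNet touched above. The ResNet/BottleneckBlock constructor signature is taken from the diff itself; the input shape, the paddle.rand call and the print are illustrative assumptions only (random weights, no pretrained parameters), and on early 2.0 pre-release builds dynamic-graph mode may need to be enabled explicitly with paddle.disable_static().

    # Illustrative sketch only -- exercises the constructor signature introduced in this diff.
    import paddle
    from paddle.vision.models.resnet import ResNet, BottleneckBlock

    # depth=50 selects the [3, 4, 6, 3] configuration from layer_cfg in ResNet.__init__.
    model = ResNet(BottleneckBlock, depth=50, num_classes=1000, with_pool=True)

    x = paddle.rand([1, 3, 224, 224])  # arbitrary NCHW example input
    logits = model(x)  # raw logits; the 'softmax' classifier_activation was removed by this patch
    print(logits.shape)  # expected: [1, 1000]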