for merge

03a2be6c · MrChengmo · 51898800 · b38e4f28 · 03a2be6c · 03a2be6c
857 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,7 +63,28 @@ if(WIN32)
        set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
        set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
        set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+        foreach(flag_var
+            CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+            CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+            CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+            CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+            if(${flag_var} MATCHES "/MD")
+                string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
            endif()
+        endforeach(flag_var)
+    endif()
+
+    # Silence compiler warnings for the Windows build: any /W1../W4 level found in the listed flag variables is replaced by /W0.
+    foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+        string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")  # result is written back to the same variable
+    endforeach(flag_var)
+    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+        set(${flag_var} "${${flag_var}} /w")  # additionally append /w to the base C and C++ flags
+    endforeach(flag_var)

    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")

--- a/cmake/external/cryptopp.cmake
+++ b/cmake/external/cryptopp.cmake
@@ -22,23 +22,8 @@ SET(CRYPTOPP_TAG        CRYPTOPP_8_2_0)

 IF(WIN32)
  SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE)
-  SET(CRYPTOPP_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
-  set(CompilerFlags
-        CMAKE_CXX_FLAGS
-        CMAKE_CXX_FLAGS_DEBUG
-        CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_C_FLAGS
-        CMAKE_C_FLAGS_DEBUG
-        CMAKE_C_FLAGS_RELEASE
-        )
-  foreach(CompilerFlag ${CompilerFlags})
-    string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
-  endforeach()
 ELSE(WIN32)
  SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE)
-  SET(CRYPTOPP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
 ENDIF(WIN32)

 set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
@@ -48,7 +33,7 @@ set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
                        -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib
                        -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR}
                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        -DCMAKE_CXX_FLAGS=${CRYPTOPP_CMAKE_CXX_FLAGS}
+                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -90,20 +90,6 @@ macro(safe_set_nvflag flag_name)
    endif()
 endmacro()

-macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
-    if (BUILD_SHARED_LIBS) 
-        return() # if build shared libs, the flags keep same with '/MD'
-    endif(BUILD_SHARED_LIBS)
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
-endmacro()

 CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
 if(NOT UINT64_MAX_EXISTS)
@@ -229,20 +215,3 @@ endforeach()

 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")

-
-if(WIN32)
-    # windows build turn off warnings.
-    if(MSVC_STATIC_CRT)
-        safe_set_static_flag()
-    endif()
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-        string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-    endforeach(flag_var)
-    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
-        set(${flag_var} "${${flag_var}} /w")
-    endforeach(flag_var)
-endif()
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -446,6 +446,9 @@ function(nv_library TARGET_NAME)
        message(FATAL "Please specify source file or library in nv_library.")
      endif()
    endif(nv_library_SRCS)
+    if (WIN32)
+      # Point the Visual Studio project of this target at the property sheet
+      # named by WIN_PROPS. The value is quoted so it is always passed as
+      # exactly one argument; unquoted, an unset or empty WIN_PROPS would leave
+      # the property without a value and make set_target_properties fail.
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS "${WIN_PROPS}")
+    endif(WIN32)
  endif()
 endfunction(nv_library)

@@ -461,6 +464,9 @@ function(nv_binary TARGET_NAME)
      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
      common_link(${TARGET_NAME})
    endif()
+    if (WIN32)
+      # Point the Visual Studio project of this target at the property sheet
+      # named by WIN_PROPS. The value is quoted so it is always passed as
+      # exactly one argument; unquoted, an unset or empty WIN_PROPS would leave
+      # the property without a value and make set_target_properties fail.
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS "${WIN_PROPS}")
+    endif(WIN32)
  endif()
 endfunction(nv_binary)

@@ -482,6 +488,9 @@ function(nv_test TARGET_NAME)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+    if (WIN32)
+      # Point the Visual Studio project of this target at the property sheet
+      # named by WIN_PROPS. The value is quoted so it is always passed as
+      # exactly one argument; unquoted, an unset or empty WIN_PROPS would leave
+      # the property without a value and make set_target_properties fail.
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS "${WIN_PROPS}")
+    endif(WIN32)
  endif()
 endfunction(nv_test)

@@ -712,6 +721,7 @@ function(proto_library TARGET_NAME)
  set(proto_hdrs)
  paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
+  add_dependencies(extern_xxhash ${TARGET_NAME})
 endfunction()

 function(py_proto_compile TARGET_NAME)

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -19,12 +19,11 @@ set(PADDLE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_install_dir" CACHE STRING
 set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING
  "A path setting paddle inference shared and static libraries")
  
-# TODO(zhaolong)
-# At present, the size of static lib in Windows exceeds the system limit,
-# so the generation of static lib is temporarily turned off.
+# At present, the static library on Windows is very large,
+# so the library needs to be cropped to reduce its size.
 if(WIN32)
    #todo: remove the option 
-    option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   OFF)
+    option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic."   OFF)
    if(NOT PYTHON_EXECUTABLE)
        FIND_PACKAGE(PythonInterp REQUIRED)
    endif()
@@ -165,25 +164,22 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 if(WIN32)
    if(WITH_STATIC_LIB)
-        set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib)
+        set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib
+                             ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.*)
    else()
        set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.dll
                             ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
    endif()
-else(WIN32)
-    set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
-endif(WIN32)
-
-if(WIN32 AND NOT WITH_STATIC_LIB)
    copy(inference_lib_dist
            SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
            DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
            ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
-else()
+else(WIN32)
+    set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
    copy(inference_lib_dist
                SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
                DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
-endif()
+endif(WIN32)

 copy(inference_lib_dist
        SRCS  ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
@@ -199,7 +195,11 @@ set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_insta
 copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR})

 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
+if(WIN32)  # the Windows pattern looks in a per-build-type subdirectory and has no "lib" prefix
+  set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/${CMAKE_BUILD_TYPE}/paddle_fluid_c.*)  # NOTE(review): CMAKE_BUILD_TYPE can be empty with multi-config generators - confirm it is always set for Windows builds
+else(WIN32)
+  set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)  # wildcard extension: the pattern does not pin a specific library suffix
+endif(WIN32)

 copy(inference_lib_dist
      SRCS  ${src_dir}/inference/capi/paddle_c_api.h  ${paddle_fluid_c_lib}
@@ -211,12 +211,12 @@ add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})

 set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid")
 set(module "inference")
-if(WIN32 AND NOT WITH_STATIC_LIB)
+if(WIN32)
        copy(fluid_lib_dist
                SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
                DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
                )
-else()
+        else()
        copy(fluid_lib_dist
                SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
                DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} 

--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@@ -26,4 +26,7 @@ if(WITH_GPU)
    set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
 endif()

+if(WIN32)
+    # Path of the MSBuild property sheet that targets reference through their
+    # VS_USER_PROPS property. The sheet sits next to this file in cmake/, so it
+    # is located via CMAKE_CURRENT_LIST_DIR instead of CMAKE_SOURCE_DIR; this
+    # resolves to the same file for a top-level build and stays valid when the
+    # project is added as a subdirectory of another CMake project. The value is
+    # quoted so it is stored as a single string.
+    set(WIN_PROPS "${CMAKE_CURRENT_LIST_DIR}/paddle_win.props")
+endif()

--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -127,7 +127,8 @@ function(op_library TARGET)
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
 "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
 "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
+"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
+"fused_bn_add_activation_op")
        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
            set(pybind_flag 1)
        endif()

--- a/cmake/paddle_win.props
+++ b/cmake/paddle_win.props
+<?xml version="1.0" encoding="utf-8"?>
+<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+    <ItemDefinitionGroup>
+        <CudaCompile>
+            <!-- Project schema: Host properties -->
+            <UseHostDefines>true</UseHostDefines>
+            <Emulation>false</Emulation>
+            <HostDebugInfo Condition="'$(Configuration)' == 'Debug'">true</HostDebugInfo>
+            <HostDebugInfo Condition="'$(Configuration)' != 'Debug'">false</HostDebugInfo>
+            <FastMath>false</FastMath>
+            <Optimization>InheritFromHost</Optimization>
+            <Runtime>InheritFromHost</Runtime>
+            <RuntimeChecks>InheritFromHost</RuntimeChecks>
+            <TypeInfo>InheritFromHost</TypeInfo>
+            <Warning>InheritFromHost</Warning>
+
+            <BaseCommandLineTemplate>-ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions]</BaseCommandLineTemplate>
+            <BuildCommandLineTemplate>--use-local-env</BuildCommandLineTemplate>
+            <BuildDynamicCommandLineTemplate>[CodeGeneration]</BuildDynamicCommandLineTemplate>
+            <CleanCommandLineTemplate>-clean</CleanCommandLineTemplate>
+            <!-- Variant of the host template that passes /Zi, kept for reference only; the active template below is identical except that /Zi is omitted: <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
+            <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate>
+
+            <DriverApiCommandLineTemplate>%(BaseCommandLineTemplate) [CompileOut] "%(FullPath)"</DriverApiCommandLineTemplate>
+            <RuntimeApiCommandLineTemplate>%(BaseCommandLineTemplate) [HostDebugInfo] [Emulation] [FastMath] [Defines] %(HostCommandLineTemplate) [CompileOut] "%(FullPath)"</RuntimeApiCommandLineTemplate>
+
+            <CommandLineTemplate>
+# (Approximate command-line.  Settings inherited from host are not visible below.)
+# (Please see the output window after a build for the full command-line)
+
+# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
+set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
+"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(DriverApiCommandLineTemplate)
+
+# Runtime API (NVCC Compilation Type is hybrid object or .c file)
+set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
+"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(RuntimeApiCommandLineTemplate)
+            </CommandLineTemplate>
+            <ExecutionDescription>Compiling CUDA source file %(Identity)...</ExecutionDescription>
+            <ExclusionDescription>Skipping CUDA source file %(Identity) (excluded from build).</ExclusionDescription>
+
+            <!-- Miscellaneous -->
+            <PropsCacheOutputFile>%(Filename)%(Extension).cache</PropsCacheOutputFile>
+            <PropsCacheOutputPath>$(IntDir)%(PropsCacheOutputFile)</PropsCacheOutputPath>
+
+            <CudaCompileCoreProject>$(MSBuildProjectFullPath)</CudaCompileCoreProject>
+        </CudaCompile>
+
+        <CudaLink>
+            <PerformDeviceLink>true</PerformDeviceLink>
+            <LinkOut>$(IntDir)$(TargetName).device-link.obj</LinkOut>
+
+            <AdditionalLibraryDirectories></AdditionalLibraryDirectories>
+            <UseHostLibraryDirectories>true</UseHostLibraryDirectories>
+            <AdditionalDependencies></AdditionalDependencies>
+            <UseHostLibraryDependencies>true</UseHostLibraryDependencies>
+
+            <GPUDebugInfo>InheritFromProject</GPUDebugInfo>
+            <Optimization>InheritFromProject</Optimization>
+
+            <!-- Implicitly inherited from the project via @(CudaCompile) -->
+            <CodeGeneration></CodeGeneration>
+            <RuntimeChecks></RuntimeChecks>
+            <Runtime></Runtime>
+            <TargetMachinePlatform></TargetMachinePlatform>
+            <TypeInfo></TypeInfo>
+            <Warning></Warning>
+
+            <Inputs></Inputs>
+
+            <!-- Variant of the device-link host template that passes /Zi, kept for reference only; the active template below is identical except that /Zi is omitted: <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
+            <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate>
+            <LinkCommandLineTemplate>"$(CudaToolkitNvccPath)" -dlink [LinkOut] %(HostCommandLineTemplate) [AdditionalLibraryDirectories] [AdditionalDependencies] [AdditionalOptions] [CodeGeneration] [GPUDebugInfo] [TargetMachinePlatform] [Inputs]</LinkCommandLineTemplate>
+            <CommandLineTemplate>
+# (Approximate command-line.  Settings inherited from host are not visible below.)
+# (Please see the output window after a build for the full command-line)
+
+%(LinkCommandLineTemplate)
+            </CommandLineTemplate>
+        </CudaLink>
+
+        <Link>
+            <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaToolkitLibDir)</AdditionalLibraryDirectories>
+        </Link>
+
+        <ClCompile>
+            <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir)</AdditionalIncludeDirectories>
+        </ClCompile>
+    </ItemDefinitionGroup>
+</Project>
+
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -39,6 +39,7 @@ set(third_party_deps)
 #            REPOSITORY ${TARGET_REPOSITORY}
 #            TAG        ${TARGET_TAG}
 #            DIR        ${TARGET_SOURCE_DIR})
+
 FUNCTION(cache_third_party TARGET)
    SET(options "")
    SET(oneValueArgs URL REPOSITORY TAG DIR)
@@ -269,6 +270,10 @@ if(WITH_PSLIB)
    endif()
 endif(WITH_PSLIB)

+if(NOT WIN32 AND NOT APPLE)  # gloo is pulled in on every platform except Windows and macOS; no build option gates it here
+    include(external/gloo)
+    list(APPEND third_party_deps extern_gloo)  # register extern_gloo in the third-party dependency list
+endif()

 if(WITH_BOX_PS)
    include(external/box_ps)
@@ -276,10 +281,6 @@ if(WITH_BOX_PS)
 endif(WITH_BOX_PS)

 if(WITH_DISTRIBUTE)
-    if(WITH_GLOO)
-        include(external/gloo)
-        list(APPEND third_party_deps extern_gloo)
-    endif()

    if(WITH_GRPC)
        list(APPEND third_party_deps extern_grpc)

--- a/paddle/fluid/framework/attribute.cc
+++ b/paddle/fluid/framework/attribute.cc
@@ -14,8 +14,6 @@ limitations under the License. */

 #include "paddle/fluid/framework/attribute.h"

-#include <vector>
-
 namespace paddle {
 namespace framework {


--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -30,6 +30,8 @@ namespace paddle {
 namespace framework {

 class ProgramDesc;
+class OpDesc;
+class VarDesc;

 // Each Protobuf Message, we provide a XXXBind class. In that class, we optimize
 // read/write speed. Only when we want the protobuf message, the local changes

--- a/paddle/fluid/framework/c/c_api.cc
+++ b/paddle/fluid/framework/c/c_api.cc
@@ -12,17 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/c/c_api.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/init.h"

 extern "C" {


--- a/paddle/fluid/framework/c/c_api.h
+++ b/paddle/fluid/framework/c/c_api.h
@@ -24,6 +24,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/platform/device_context.h"

+namespace paddle {
+namespace framework {
+class OpInfoMap;
+}  // namespace framework
+namespace platform {
+class DeviceContextPool;
+}  // namespace platform
+}  // namespace paddle
+
 #ifdef __cplusplus
 extern "C" {
 #endif

--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -277,7 +277,7 @@ class ChannelObject {
    size_t finished = 0;
    while (finished < n && WaitForWrite(lock)) {
      size_t m =
-          std::min(n - finished, capacity_ + reading_count_ - data_.size());
+          (std::min)(n - finished, capacity_ + reading_count_ - data_.size());
      for (size_t i = 0; i < m; i++) {
        data_.push_back(std::move(p[finished++]));
      }

--- a/paddle/fluid/framework/copy_same_tensor_test.cc
+++ b/paddle/fluid/framework/copy_same_tensor_test.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <cstring>
 #include <random>
+
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/tensor.h"

--- a/paddle/fluid/framework/data_device_transform.h
+++ b/paddle/fluid/framework/data_device_transform.h
@@ -21,6 +21,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+class Tensor;
+
 void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
                     Tensor* out);


--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -41,6 +41,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/string/string_helper.h"

+namespace paddle {
+namespace framework {
+class DataFeedDesc;
+class LoDTensor;
+class Scope;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {

@@ -418,6 +427,7 @@ class MultiSlotType {

  std::string DebugString() {
    std::stringstream ss;
+
    ss << "\ntype: " << type_ << "\n";
    ss << "offset: ";
    ss << "[";

--- a/paddle/fluid/framework/data_feed_factory.cc
+++ b/paddle/fluid/framework/data_feed_factory.cc
@@ -17,10 +17,10 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>

-#include "paddle/fluid/framework/data_feed.h"
-
 namespace paddle {
 namespace framework {
+class DataFeed;
+
 typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
 typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
 data_feedMap g_data_feed_map;

--- a/paddle/fluid/framework/data_feed_factory.h
+++ b/paddle/fluid/framework/data_feed_factory.h
@@ -16,10 +16,13 @@ limitations under the License. */

 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/data_feed.h"

 namespace paddle {
 namespace framework {
+class DataFeed;
+
 class DataFeedFactory {
 public:
  static std::string DataFeedTypeList();

--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -13,8 +13,8 @@
 // limitations under the License.

 #include "paddle/fluid/framework/data_layout_transform.h"
+
 #include <string>
-#include <vector>

 #include "paddle/fluid/operators/math/math_function.h"
 #ifdef PADDLE_WITH_MKLDNN

--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -17,10 +17,18 @@
 #include <map>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"

+namespace paddle {
+namespace framework {
+class OpKernelType;
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif

--- a/paddle/fluid/framework/data_layout_transform_test.cc
+++ b/paddle/fluid/framework/data_layout_transform_test.cc
@@ -15,7 +15,6 @@
 #include "paddle/fluid/framework/data_layout_transform.h"

 #include "gtest/gtest.h"
-#include "paddle/fluid/platform/device_context.h"

 TEST(DataTransform, DataLayoutFunction) {
  auto place = paddle::platform::CPUPlace();

--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -18,8 +18,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/data_type_transform.h"

+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 #ifdef PADDLE_WITH_MKLDNN
-#include <algorithm>
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif


--- a/paddle/fluid/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
@@ -30,6 +30,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+class OpKernelType;
+class Tensor;
+class Variable;
+
 void TransformData(const OpKernelType &expected_kernel_type,
                   const OpKernelType &kernel_type_for_var,
                   const Tensor &input_tensor, Tensor *out);

--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -13,7 +13,6 @@
 // limitations under the License.

 #include "paddle/fluid/framework/data_type.h"
-#include <stdint.h>
 #include <string>
 #include <unordered_map>


--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -15,12 +15,19 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <typeindex>
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/platform/enforce.h"

+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"

+namespace paddle {
+namespace platform {
+struct bfloat16;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {


--- a/paddle/fluid/framework/data_type_test.cc
+++ b/paddle/fluid/framework/data_type_test.cc
@@ -14,9 +14,17 @@
 #include "paddle/fluid/framework/data_type.h"

 #include <string>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/tensor.h"

+namespace paddle {
+namespace platform {
+struct bfloat16;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 TEST(DataType, float16) {
  using paddle::framework::Tensor;
  using paddle::platform::CPUPlace;

--- a/paddle/fluid/framework/data_type_transform.h
+++ b/paddle/fluid/framework/data_type_transform.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <utility>
+
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
@@ -23,6 +24,9 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+class OpKernelType;
+class Tensor;
+
 using KernelTypePair = std::pair<OpKernelType, OpKernelType>;

 void TransDataType(const OpKernelType& kernel_type_for_var,

--- a/paddle/fluid/framework/dataset_factory.cc
+++ b/paddle/fluid/framework/dataset_factory.cc
@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/dataset_factory.h"
-#include <memory>
 #include <string>
 #include <unordered_map>

-#include "paddle/fluid/framework/data_set.h"
-
 namespace paddle {
 namespace framework {
 typedef std::unique_ptr<Dataset> (*CreateDatasetFunction)();

--- a/paddle/fluid/framework/dataset_factory.h
+++ b/paddle/fluid/framework/dataset_factory.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/data_set.h"

 namespace paddle {

--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/dim.h"

 namespace paddle {

--- a/paddle/fluid/framework/ddim_test.cc
+++ b/paddle/fluid/framework/ddim_test.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <sstream>
-#include <vector>

 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/ddim.h"

--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -20,6 +20,17 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class NCCLCommunicator;
+}  // namespace platform
+}  // namespace paddle
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"

--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -24,6 +24,20 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"

+namespace paddle {
+namespace framework {
+namespace details {
+struct VarHandle;
+}  // namespace details
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+struct NCCLContextMap;
+}  // namespace platform
+}  // namespace paddle
+
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -21,13 +21,15 @@

 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
-
 #include "paddle/fluid/platform/device_context.h"

 namespace paddle {
 namespace framework {
 namespace details {

+struct DummyVarHandle;
+struct VarHandle;
+
 namespace f = paddle::framework;
 namespace p = paddle::platform;


--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -27,6 +27,18 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"

+namespace paddle {
+namespace framework {
+namespace ir {
+class Graph;
+class PassBuilder;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class NCCLCommunicator;
+}  // namespace platform
+}  // namespace paddle
+
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -19,6 +19,8 @@
 namespace paddle {
 namespace framework {
 namespace details {
+struct VarHandleBase;
+
 ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
                                         platform::Place place,
                                         size_t scope_idx)

--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -24,9 +24,21 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"

+namespace paddle {
+namespace framework {
+class OperatorBase;
+class Scope;
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
+struct VarHandleBase;
+
 class ComputationOpHandle : public OpHandleBase {
 public:
  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,

--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -19,12 +19,23 @@
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"

+namespace paddle {
+namespace platform {
+class CUDADeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 class Scope;
+class GarbageCollector;
+namespace ir {
+class Node;
+}  // namespace ir

 namespace ir {
 class MemOptVarInfo;

--- a/paddle/fluid/framework/details/exception_holder_test.cc
+++ b/paddle/fluid/framework/details/exception_holder_test.cc
@@ -13,8 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/framework/details/exception_holder.h"
-#include <memory>
-#include <unordered_map>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator.h"


--- a/paddle/fluid/framework/details/fetch_async_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc
@@ -15,9 +15,14 @@
 #include "paddle/fluid/framework/details/fetch_async_op_handle.h"
 #include <string>
 #include <utility>
-#include <vector>
 #include "paddle/fluid/platform/profiler.h"

+namespace paddle {
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/fetch_async_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.h
@@ -22,6 +22,18 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"

+namespace paddle {
+namespace framework {
+class LoDTensor;
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/fetch_barrier_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.cc
@@ -19,6 +19,8 @@
 namespace paddle {
 namespace framework {
 namespace details {
+struct VarHandleBase;
+
 FetchBarrierOpHandle::FetchBarrierOpHandle(
    ir::Node *node, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places)

--- a/paddle/fluid/framework/details/fetch_barrier_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.h
@@ -24,6 +24,15 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"

+namespace paddle {
+namespace framework {
+class Scope;
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -32,6 +41,8 @@ namespace details {
 // all places if there are multiple places, must init with
 // multiple dev_ctxes_ !!!!

+struct VarHandleBase;
+
 struct FetchBarrierOpHandle : public OpHandleBase {
 public:
  FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,

--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -22,6 +22,17 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"

+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
@@ -17,10 +17,22 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class NCCLCommunicator;
+}  // namespace platform
+}  // namespace paddle
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"

--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -25,6 +25,17 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"

+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+struct NCCLContextMap;
+}  // namespace platform
+}  // namespace paddle
+
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -17,11 +17,20 @@
 #include <unordered_map>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle

 namespace paddle {
 namespace framework {
 namespace details {

+struct VarHandle;
+
 struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
  std::vector<std::string> out_varnames_;
  std::vector<std::unique_ptr<ir::Node>> nodes_;

--- a/paddle/fluid/framework/details/gather_op_handle.h
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -24,6 +24,14 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"

+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -17,11 +17,11 @@
 #include <unordered_map>
 #include "gtest/gtest.h"

-#include "paddle/fluid/platform/device_context.h"
-
 namespace paddle {
 namespace framework {
 namespace details {
+struct DummyVarHandle;
+
 namespace f = paddle::framework;
 namespace p = paddle::platform;


--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -20,16 +20,21 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/var_handle.h"
-
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/place.h"

-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+}  // namespace paddle

 namespace paddle {
 namespace framework {

--- a/paddle/fluid/framework/details/nan_inf_utils_detail.h
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h
@@ -19,6 +19,12 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/place.h"

+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -24,10 +24,22 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/macros.h"

+namespace paddle {
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {

 class Scope;
+namespace details {
+struct VarHandleBase;
+}  // namespace details
+namespace ir {
+class Node;
+}  // namespace ir

 namespace details {


--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -24,6 +24,21 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+class SelectedRows;
+namespace details {
+struct VarHandle;
+}  // namespace details
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+struct NCCLContextMap;
+}  // namespace platform
+}  // namespace paddle
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

--- a/paddle/fluid/framework/details/rpc_op_handle.h
+++ b/paddle/fluid/framework/details/rpc_op_handle.h
@@ -24,6 +24,16 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"

+namespace paddle {
+namespace framework {
+class OpDesc;
+class Scope;
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -13,9 +13,17 @@
 // limitations under the License.

 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+
 #include <string>
+
 #include "paddle/fluid/platform/profiler.h"

+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -21,6 +21,18 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"

+namespace paddle {
+namespace framework {
+class Scope;
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/scope_buffered_monitor.h
+++ b/paddle/fluid/framework/details/scope_buffered_monitor.h
@@ -17,7 +17,9 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/scope.h"
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
@@ -22,6 +22,17 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/enforce.h"

+namespace paddle {
+namespace framework {
+class Scope;
+class Tensor;
+class Variable;
+namespace ir {
+class MemOptVarInfo;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h
@@ -25,6 +25,15 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"

+namespace paddle {
+namespace framework {
+class Scope;
+namespace ir {
+class MemOptVarInfo;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
@@ -23,10 +23,20 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/enforce.h"

+namespace paddle {
+namespace framework {
+namespace ir {
+class MemOptVarInfo;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

+class ComputationOpHandle;
+
 ComputationOpHandle *GetUniquePendingComputationOpHandle(
    ShareTensorBufferOpHandle *share_tensor_op) {
  ComputationOpHandle *result_op = nullptr;

--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
@@ -22,10 +22,22 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"

+namespace paddle {
+namespace framework {
+class Scope;
+namespace ir {
+class MemOptVarInfo;
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

+class ComputationOpHandle;
+
 class ShareTensorBufferOpHandle : public OpHandleBase {
 public:
  ShareTensorBufferOpHandle(

--- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
@@ -23,6 +23,17 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/nccl_helper.h"

+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class NCCLCommunicator;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -24,6 +24,14 @@
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"

+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/variable_visitor.cc
+++ b/paddle/fluid/framework/details/variable_visitor.cc
@@ -15,6 +15,14 @@
 #include "paddle/fluid/framework/details/variable_visitor.h"

 #include "paddle/fluid/framework/selected_rows.h"
+
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/details/variable_visitor.h
+++ b/paddle/fluid/framework/details/variable_visitor.h
@@ -17,6 +17,13 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/variable.h"

+namespace paddle {
+namespace framework {
+class Tensor;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {

--- a/paddle/fluid/framework/device_worker.cc
+++ b/paddle/fluid/framework/device_worker.cc
@@ -13,11 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/device_worker.h"
-#include "xxhash.h"  // NOLINT

 namespace paddle {
 namespace framework {

+class LoDTensor;
+class Scope;
+
 void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; }

 void DeviceWorker::SetDataFeed(DataFeed* data_feed) {

--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -39,6 +39,18 @@ limitations under the License. */
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/timer.h"

+namespace paddle {
+namespace framework {
+class LoDTensor;
+class ProgramDesc;
+class Scope;
+class Tensor;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -62,7 +74,9 @@ class PullDenseWorker {
  virtual void Initialize(const TrainerDesc& param);
 #ifdef PADDLE_WITH_CUDA
  void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+#endif

+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
  void AddPlace(const paddle::platform::Place place) {
    places_.push_back(place);
  }
@@ -123,9 +137,9 @@ class PullDenseWorker {

 #ifdef PADDLE_WITH_CUDA
  std::vector<cudaStream_t> copy_streams_;
+#endif
  std::vector<paddle::platform::Place> places_;
  std::vector<Scope*> thread_scopes_;
-#endif
 };

 // should incorporate different type of device
@@ -149,6 +163,7 @@ class DeviceWorker {
  virtual void SetDataFeed(DataFeed* data_feed);
  virtual void SetWorkerNum(int num) {}
  virtual void CacheProgram(const ProgramDesc& main_program) {}
+  virtual void GetXpuOpIndex() {}
  virtual void SetNeedDumpField(bool need_dump_field) {
    need_dump_field_ = need_dump_field;
  }

--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -20,6 +20,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+class DeviceWorker;
+
 typedef std::shared_ptr<DeviceWorker> (*Createdevice_workerFunction)();
 typedef std::unordered_map<std::string, Createdevice_workerFunction>
    device_workerMap;

--- a/paddle/fluid/framework/device_worker_factory.h
+++ b/paddle/fluid/framework/device_worker_factory.h
@@ -16,11 +16,14 @@ limitations under the License. */

 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/device_worker.h"

 namespace paddle {
 namespace framework {

+class DeviceWorker;
+
 class DeviceWorkerFactory {
 public:
  static std::string DeviceWorkerTypeList();

--- a/paddle/fluid/framework/device_worker_test.cc
+++ b/paddle/fluid/framework/device_worker_test.cc
@@ -13,9 +13,10 @@
 // limitations under the License.

 #include "paddle/fluid/framework/device_worker.h"
+
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/trainer.h"

 namespace paddle {
 namespace framework {

--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -128,6 +128,7 @@ message DistributedStrategy {
  optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
  optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
  optional bool adaptive_localsgd = 24 [ default = false ];
+  optional bool fp16_allreduce = 25 [ default = false ];

  optional RecomputeConfig recompute_configs = 101;
  optional AMPConfig amp_configs = 102;

--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -11,10 +11,17 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/framework/dlpack_tensor.h"
 #include <unordered_map>
-
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/dlpack_tensor.h"
+
+namespace paddle {
+namespace platform {
+struct bfloat16;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {


--- a/paddle/fluid/framework/dlpack_tensor.h
+++ b/paddle/fluid/framework/dlpack_tensor.h
@@ -15,11 +15,14 @@
 #pragma once

 #include <dlpack/dlpack.h>
+
 #include "paddle/fluid/framework/tensor.h"

 namespace paddle {
 namespace framework {

+class Tensor;
+
 class DLPackTensor {
 public:
  using LaneType = decltype(::DLTensor::dtype.lanes);  // uint16_t

--- a/paddle/fluid/framework/dlpack_tensor_test.cc
+++ b/paddle/fluid/framework/dlpack_tensor_test.cc
@@ -17,6 +17,12 @@
 #include <gtest/gtest.h>
 #include <vector>

+namespace paddle {
+namespace platform {
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {


--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/string/string_helper.h"
+
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Variable;
+}  // namespace framework
+}  // namespace paddle

 #if defined _WIN32 || defined __APPLE__
 #else

--- a/paddle/fluid/framework/downpour_worker_opt.cc
+++ b/paddle/fluid/framework/downpour_worker_opt.cc
@@ -12,18 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <set>
-#include <unordered_map>
-#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/lodtensor_printer.h"

 namespace paddle {
 namespace framework {

+class OpDesc;
+class OperatorBase;
+class ProgramDesc;
+
 bool HasDependentOutput(const OpDesc& op_desc,
                        const std::unordered_set<std::string>& dependent_vars) {
  for (auto& var : op_desc.Outputs()) {

--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -14,6 +14,8 @@ limitations under the License. */

 #pragma once

+#include <stdint.h>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"


--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/data_set.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/garbage_collector.h"
@@ -32,6 +33,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+class Dataset;
+class ProgramDesc;
+class Scope;
+class TrainerBase;
+
 struct ExecutorPrepareContext {
  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);


--- a/paddle/fluid/framework/executor_gc_helper.cc
+++ b/paddle/fluid/framework/executor_gc_helper.cc
@@ -13,16 +13,19 @@
 // limitations under the License.

 #include "paddle/fluid/framework/executor_gc_helper.h"
+
 #include <deque>
 #include <string>
-#include <unordered_map>
 #include <unordered_set>
 #include <utility>
-#include <vector>
+
 #include "glog/logging.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {

--- a/paddle/fluid/framework/executor_gc_helper.h
+++ b/paddle/fluid/framework/executor_gc_helper.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
@@ -26,6 +27,10 @@ namespace paddle {
 namespace framework {

 // Result map: op -> variable names that can be deleted after op runs
+class GarbageCollector;
+class OperatorBase;
+class Scope;
+
 std::unordered_map<const OperatorBase *, std::vector<std::string>>
 GetUnusedVars(const BlockDesc &block,
              const std::vector<std::unique_ptr<OperatorBase>> &ops,

--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -13,16 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/feed_fetch_method.h"
+
 #include <string>
-#include <vector>
+
 #include "glog/logging.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace framework {

+class LoDTensor;
+class Variable;
+
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
                     const std::string& var_name, size_t index) {
  // If var_name Variable is not found in GlobalScope, a new variable will

--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -15,12 +15,16 @@ limitations under the License. */
 #pragma once

 #include <string>
+
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/scope.h"

 namespace paddle {
 namespace framework {

+class LoDTensor;
+class Scope;
+
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
                     const std::string& var_name, size_t index);


--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -29,12 +29,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include <algorithm>
 #include <utility>
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/io/fs.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/timer.h"

 namespace paddle {
 namespace framework {
@@ -751,7 +745,72 @@ void FleetWrapper::PushDenseVarsAsync(
     push_sparse_status->push_back(std::move(status));
   }
 }
+#endif
+
+#ifdef PADDLE_WITH_XPU
+// XPU overload: for each name in var_names, copies the dense gradient from
+// the XPU tensor into the host tensor "<name>pin", rescales the batch_size /
+// batch_sum / batch_square_sum gradients when scale_datanorm >= 0, and sends
+// all buffers to the PSLIB worker in one push_dense call for table_id. The
+// returned status is appended to push_sparse_status when that pointer is
+// non-null. The body is empty unless PADDLE_WITH_PSLIB is defined.
+void FleetWrapper::PushDenseVarsAsync(
+    const Scope& scope, const uint64_t table_id,
+    const std::vector<std::string>& var_names,
+    std::vector<::std::future<int32_t>>* push_sparse_status,
+    float scale_datanorm, int batch_size,
+    const paddle::platform::Place& place) {
+#ifdef PADDLE_WITH_PSLIB
+  std::vector<paddle::ps::Region> regions;
+  for (auto& t : var_names) {
+    Variable* var = scope.FindVar(t);
+    // Fail with a clear error instead of dereferencing a null pointer when
+    // the variable is missing from the scope.
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound(
+                 "Variable %s is not found in scope.", t));
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int count = tensor->numel();
+    float* g_data = tensor->data<float>();
+
+    Variable* pin_var = scope.FindVar(t + "pin");
+    PADDLE_ENFORCE_NOT_NULL(
+        pin_var, platform::errors::NotFound(
+                     "Variable %spin is not found in scope.", t));
+    LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
+    float* pin_g =
+        pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
+    // Device-to-host copy into the CPU staging buffer.
+    memory::Copy(platform::CPUPlace(), pin_g,
+                 BOOST_GET_CONST(platform::XPUPlace, place), g_data,
+                 sizeof(float) * count);
+
+    float* g = pin_g;
+    if (scale_datanorm >= 0) {
+      if (t.find(".batch_size@GRAD") != std::string::npos ||
+          t.find(".batch_sum@GRAD") != std::string::npos) {
+        Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
+        float scale = 1.0 / batch_size;
+        mat *= scale;
+      } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
+        VLOG(3) << "epsilon: " << scale_datanorm;
+        for (int i = 0; i < count; ++i) {
+          g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
+                 batch_size * scale_datanorm;
+        }
+      }
+    }
+    paddle::ps::Region reg(g, count);
+    regions.emplace_back(std::move(reg));
+  }

+  auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
+                                                    regions.size(), table_id);
+  if (push_sparse_status) {
+    push_sparse_status->push_back(std::move(status));
+  }
+#endif
+}
 #endif
 void FleetWrapper::PushDenseVarsAsync(
     const Scope& scope, const uint64_t table_id,

--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -35,6 +35,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN

+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {

@@ -154,6 +160,14 @@ class FleetWrapper {
      float scale_datanorm, int batch_size,
      const paddle::platform::Place& place, cudaStream_t stream,
      cudaEvent_t event);
+#endif
+#ifdef PADDLE_WITH_XPU
+  void PushDenseVarsAsync(
+      const Scope& scope, const uint64_t table_id,
+      const std::vector<std::string>& var_names,
+      std::vector<::std::future<int32_t>>* push_sparse_status,
+      float scale_datanorm, int batch_size,
+      const paddle::platform::Place& place);
 #endif
  void PushDenseVarsAsync(
      const Scope& scope, const uint64_t table_id,

--- a/paddle/fluid/framework/fleet/gloo_wrapper.cc
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc
@@ -10,10 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
-#include <thread>  // NOLINT
-#include <vector>
 #include "paddle/fluid/framework/io/fs.h"
-#include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/string/string_helper.h"

 namespace gloo {

--- a/paddle/fluid/framework/fleet/heter_wrapper.cc
+++ b/paddle/fluid/framework/fleet/heter_wrapper.cc
@@ -27,15 +27,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/fleet/heter_wrapper.h"
-#include <algorithm>
-#include <utility>
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/io/fs.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/timer.h"
 #ifdef PADDLE_WITH_PSLIB

 namespace paddle {
@@ -122,29 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
  if (platform::is_cpu_place(tensor->place())) {
    memcpy(data_ptr, tensor->data<void>(),
           tensor->numel() * SizeOfType(tensor->type()));
-  }
+  } else {
 #ifdef PADDLE_WITH_CUDA
-  else {
    memory::Copy(platform::CPUPlace(), data_ptr,
                 BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
                 tensor->data<void>(),
                 tensor->numel() * SizeOfType(tensor->type()), nullptr);
-  }
 #endif
+#ifdef PADDLE_WITH_XPU
+    memory::Copy(platform::CPUPlace(), data_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
+                 tensor->data<void>(),
+                 tensor->numel() * SizeOfType(tensor->type()));
+#endif
+  }
 }

-// void HeterWrapper::DeSerializeToTensor(Scope* scope,
-// const HeterRequest* request) {
 #ifdef PADDLE_WITH_CUDA
 void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                       const VariableMessage& req_var,
                                       platform::Place place,
                                       cudaStream_t stream) {
+  // const VariableMessage& req_var = request->vars();
+  auto* var = scope->FindVar(req_var.varname());
+  auto* tensor = var->GetMutable<LoDTensor>();
+
+  std::vector<int> vec_dim;
+  for (auto& x : req_var.dims()) {
+    vec_dim.push_back(x);
+  }
+  tensor->Resize(make_ddim(vec_dim));
+
+  LoD lod;
+  for (int i = 0; i < req_var.lod_level(); ++i) {
+    framework::Vector<size_t> v;
+    for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
+      v.push_back(req_var.lod(i).lod_data(j));
+    }
+    lod.push_back(v);
+  }
+  tensor->set_lod(lod);
+
+  void* tensor_data =
+      tensor->mutable_data(place, ToVarType(req_var.data_type()));
+
+#ifdef PADDLE_WITH_CUDA
+  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
+               platform::CPUPlace(), req_var.data().data(),
+               tensor->numel() * SizeOfType(tensor->type()), stream);
 #else
+  memcpy(tensor_data, req_var.data().data(),
+         tensor->numel() * SizeOfType(tensor->type()));
+#endif
+}
+#endif
+
+// void HeterWrapper::DeSerializeToTensor(Scope* scope,
+// const HeterRequest* request) {
 void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                       const VariableMessage& req_var,
                                       platform::Place place) {
-#endif
  // const VariableMessage& req_var = request->vars();
  auto* var = scope->FindVar(req_var.varname());
  auto* tensor = var->GetMutable<LoDTensor>();
@@ -168,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
  void* tensor_data =
      tensor->mutable_data(place, ToVarType(req_var.data_type()));

-#ifdef PADDLE_WITH_CUDA
-  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
+#ifdef PADDLE_WITH_XPU
+  memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data,
               platform::CPUPlace(), req_var.data().data(),
-               tensor->numel() * SizeOfType(tensor->type()), stream);
+               tensor->numel() * SizeOfType(tensor->type()));
 #else
  memcpy(tensor_data, req_var.data().data(),
         tensor->numel() * SizeOfType(tensor->type()));
@@ -192,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType(
    case VariableMessage::BOOL:
      return framework::proto::VarType::BOOL;  // NOLINT
    default:
-      VLOG(0) << "Not support type " << type;
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "ToVarType:Unsupported type %d", type));
  }
 }

@@ -239,7 +268,7 @@ void HeterWrapper::CallRemoteXpu(std::shared_ptr<HeterTask> task,
  request.set_cur_batch(task->cur_batch_);

  OnHeterRpcDone* done = new OnHeterRpcDone([this, task, worker](void* done) {
-    auto* closure = (OnHeterRpcDone*)done;
+    auto* closure = static_cast<OnHeterRpcDone*>(done);
    if (closure->cntl.Failed()) {
      VLOG(0) << "call xpu fail: " << closure->cntl.ErrorText();
    } else {

--- a/paddle/fluid/framework/fleet/nccl_wrapper.cc
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc
@@ -13,9 +13,6 @@
 // limitations under the License.

 #include "paddle/fluid/framework/fleet/nccl_wrapper.h"
-#include <utility>
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/scope.h"

 namespace paddle {
 namespace framework {

--- a/paddle/fluid/framework/fleet/nccl_wrapper.h
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <random>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable_helper.h"
@@ -29,6 +30,12 @@ limitations under the License. */
 #endif
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN

+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {


--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -19,9 +19,16 @@
 #include <memory>
 #include <mutex>  // NOLINT
 #include <utility>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/device_context.h"

+namespace paddle {
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {


--- a/paddle/fluid/framework/generator.h
+++ b/paddle/fluid/framework/generator.h
@@ -16,7 +16,6 @@ limitations under the License. */

 #include <glog/logging.h>
 #include <stdint.h>
-
 #include <atomic>
 #include <deque>
 #include <iostream>  // temp for debug

--- a/paddle/fluid/framework/hetercpu_worker.cc
+++ b/paddle/fluid/framework/hetercpu_worker.cc
@@ -12,13 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
-#include "paddle/fluid/framework/fleet/heter_wrapper.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/string/string_helper.h"
-
 #ifdef PADDLE_WITH_PSLIB

 #if defined _WIN32 || defined __APPLE__

--- a/paddle/fluid/framework/heterxpu_trainer.cc
+++ b/paddle/fluid/framework/heterxpu_trainer.cc
@@ -22,9 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/framework/trainer.h"
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+    (defined PADDLE_WITH_PSLIB)
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
-
+#endif
 namespace paddle {
 namespace framework {

@@ -44,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
  int place_num = trainer_desc.worker_places_size();
  for (int i = 0; i < place_num; ++i) {
    int num = trainer_desc.worker_places(i);
+#ifdef PADDLE_WITH_CUDA
    platform::CUDAPlace place = platform::CUDAPlace(num);
    platform::CUDADeviceGuard guard(place.device);
    cudaStream_t stream;
@@ -54,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
    events_.push_back(event);
+#endif
+#ifdef PADDLE_WITH_XPU
+    platform::XPUPlace place = platform::XPUPlace(num);
+    places_.push_back(place);
+#endif
  }
  // thread_num_ = trainer_desc.thread_num();
  // SetDataset(dataset);
@@ -105,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
 void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
  auto place = places_[num];
  Scope* scope = place_scopes_[num];
+#ifdef PADDLE_WITH_CUDA
  auto stream = copy_streams_[num];
  auto event = events_[num];
-
  auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
  platform::CUDADeviceGuard guard(dev_id);
+#endif
+
+#ifdef PADDLE_WITH_XPU
+  xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+#endif
+
  auto& block = program.Block(0);
  for (auto& var : block.AllVars()) {
    if (var->Persistable()) {
@@ -126,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
      HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
    }                                                                   \
  } while (0)
+
+#define HeterMemcpyXpuFunc(cpp_type, proto_type)                \
+  do {                                                          \
+    if (root_tensor->type() == proto_type) {                    \
+      HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place); \
+    }                                                           \
+  } while (0)
+#ifdef PADDLE_WITH_CUDA
      _ForEachDataType_(HeterMemcpyFunc);
+#endif
+#ifdef PADDLE_WITH_XPU
+      _ForEachDataType_(HeterMemcpyXpuFunc);
+#endif
    }
  }
+#ifdef PADDLE_WITH_CUDA
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
  cudaEventSynchronize(event);
+#endif
 }

+#ifdef PADDLE_WITH_CUDA
 template <typename T>
 void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                                  LoDTensor* root_tensor,
@@ -151,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                 root_ptr, sizeof(T) * root_tensor->numel(), stream);
  }
 }
+#endif
+
+#ifdef PADDLE_WITH_XPU
+template <typename T>
+void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
+                                  LoDTensor* root_tensor,
+                                  const paddle::platform::Place& thread_place) {
+  T* thread_ptr =
+      thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
+  T* root_ptr = root_tensor->data<T>();
+  if (platform::is_cpu_place(root_tensor->place())) {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
+                 platform::CPUPlace(), root_ptr,
+                 sizeof(T) * root_tensor->numel());
+  } else {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()),
+                 root_ptr, sizeof(T) * root_tensor->numel());
+  }
+}
+#endif

 void HeterXpuTrainer::DumpWork(int tid) {}

@@ -181,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
    CreateThreadParam(main_program, i);
    pull_dense_worker_->AddThreadScope(scope);
    pull_dense_worker_->AddPlace(places_[i]);
+#ifdef PADDLE_WITH_CUDA
    pull_dense_worker_->AddStream(copy_streams_[i]);
+#endif
  }
-
  pull_dense_worker_->Start();
+#ifdef PADDLE_WITH_CUDA
  for (auto& stream : copy_streams_) {
    cudaStreamSynchronize(stream);
  }
+#endif
  op_names_.clear();
  for (auto& op_desc : block.AllOps()) {
    std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
@@ -230,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
        OperatorBase* local_op_ptr = local_op.release();
        (context->ops_).push_back(local_op_ptr);
      }
+#ifdef PADDLE_WITH_CUDA
      auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
      platform::CUDADeviceGuard guard(dev_id);
      PADDLE_ENFORCE_CUDA_SUCCESS(
          cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
+#endif
      object_pool_.Push(context);
    }
  }
@@ -277,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
    }                                                                          \
  } while (0)
      _ForEachDataType_(MergeCallback);
-      if (platform::is_gpu_place(thread_tensor->place())) {
+      if (!platform::is_cpu_place(thread_tensor->place())) {
+#ifdef PADDLE_WITH_CUDA
        auto dev_id =
            BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
        platform::CUDADeviceGuard guard(dev_id);
        cudaMemset(thread_tensor->data<void>(), 0,
                   thread_tensor->numel() * SizeOfType(thread_tensor->type()));
+#endif
+#ifdef PADDLE_WITH_XPU
+        auto place = thread_tensor->place();
+        xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+        platform::DeviceContextPool& pool =
+            platform::DeviceContextPool::Instance();
+        platform::DeviceContext* dev_ctx = pool.Get(place);
+        const platform::XPUDeviceContext* xpu_ctx =
+            static_cast<const platform::XPUDeviceContext*>(dev_ctx);
+        xpu::memset(xpu_ctx->x_context(), thread_tensor->data<void>(), 0,
+                    thread_tensor->numel() * SizeOfType(thread_tensor->type()));
+#endif
      } else {
        memset(thread_tensor->data<void>(), 0,
               thread_tensor->numel() * SizeOfType(thread_tensor->type()));
@@ -291,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
    auto* merge_var = response->add_vars();
    heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_,
                               merge_var);
-    if (platform::is_gpu_place(root_tensor->place())) {
+    if (!platform::is_cpu_place(root_tensor->place())) {
+#ifdef PADDLE_WITH_CUDA
      auto dev_id =
          BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
      platform::CUDADeviceGuard guard(dev_id);
      cudaMemset(root_tensor->data<void>(), 0,
                 root_tensor->numel() * SizeOfType(root_tensor->type()));
+#endif
+#ifdef PADDLE_WITH_XPU
+      auto place = root_tensor->place();
+      xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      platform::DeviceContext* dev_ctx = pool.Get(place);
+      const platform::XPUDeviceContext* xpu_ctx =
+          static_cast<const platform::XPUDeviceContext*>(dev_ctx);
+      xpu::memset(xpu_ctx->x_context(), root_tensor->data<void>(), 0,
+                  root_tensor->numel() * SizeOfType(root_tensor->type()));
+#endif
    } else {
      memset(root_tensor->data<void>(), 0,
             root_tensor->numel() * SizeOfType(root_tensor->type()));
@@ -334,7 +415,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
  std::shared_ptr<HeterServiceContext> context = object_pool_.Get();

  if (!context->scope_) {
-    int num = rand() % places_.size();
+    int num = rand() % places_.size();  // NOLINT(runtime/threadsafe_fn)
    context->place_num_ = num;
    auto place = places_[num];
    context->scope_ = &(place_scopes_[num]->NewScope());
@@ -356,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
      OperatorBase* local_op_ptr = local_op.release();
      (context->ops_).push_back(local_op_ptr);
    }
-
+#ifdef PADDLE_WITH_CUDA
    auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
    platform::CUDADeviceGuard guard(dev_id);
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
+#endif
  }

  context->Reset();
@@ -369,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
    auto deserial_timer =
        std::make_shared<paddle::ps::CostTimer>("xpu_service_deserial");
    for (int i = 0; i < request->vars_size(); ++i) {
+#ifdef PADDLE_WITH_CUDA
      heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place,
                                      copy_streams_[context->place_num_]);
+#endif
+#ifdef PADDLE_WITH_XPU
+      heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place);
+#endif
    }
+#ifdef PADDLE_WITH_CUDA
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventRecord(context->event_, copy_streams_[context->place_num_]));
    while (cudaEventQuery(context->event_) != cudaSuccess) {
      VLOG(3) << "wait for kernel";
      bthread_yield();
    }
+#endif
  }

  {
@@ -388,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
      op->Run(*(context->scope_), place);
    }
  }
+#ifdef PADDLE_WITH_CUDA
  auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
      platform::DeviceContextPool::Instance().Get(place));
  PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -401,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
      bthread_yield();
    }
  }
+#endif
+#ifdef PADDLE_WITH_XPU
+  xpu_wait();
+#endif

  for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) {
    const std::string& varname = trainer_desc_.xpu_send_list(i);
@@ -417,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
       ++i) {
    uint64_t tid =
        static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
+#ifdef PADDLE_WITH_CUDA
    fleet_ptr_->PushDenseVarsAsync(
        *(context->scope_), tid, dense_grad_names_[tid],
        &(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
        places_[context->place_num_], copy_streams_[context->place_num_],
        context->event_);
+#endif
+#ifdef PADDLE_WITH_XPU
+    fleet_ptr_->PushDenseVarsAsync(
+        *(context->scope_), tid, dense_grad_names_[tid],
+        &(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
+        places_[context->place_num_]);
+#endif
  }
  for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
       ++i) {
@@ -463,7 +565,6 @@ void HeterXpuTrainer::Finalize() {
  pull_dense_worker_->Stop();
  root_scope_->DropKids();
 }
-
 }  // namespace framework
 }  // namespace paddle
 #endif
--- a/paddle/fluid/framework/inlined_vector.h
+++ b/paddle/fluid/framework/inlined_vector.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <cstdint>
 #include <vector>
+
 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {

--- a/paddle/fluid/framework/inlined_vector_test.cc
+++ b/paddle/fluid/framework/inlined_vector_test.cc
@@ -13,10 +13,10 @@
 // limitations under the License.

 #include "paddle/fluid/framework/inlined_vector.h"
+
 #include <cstdlib>
 #include <ctime>
-#include <iostream>
-#include <vector>
+
 #include "gtest/gtest.h"

 namespace paddle {

--- a/paddle/fluid/framework/io/crypto/cipher_utils.cc
+++ b/paddle/fluid/framework/io/crypto/cipher_utils.cc
@@ -15,8 +15,6 @@
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"

 #include <cryptopp/osrng.h>
-
-#include <fstream>
 #include <sstream>

 #include "paddle/fluid/platform/enforce.h"

--- a/paddle/fluid/framework/io/crypto/cipher_utils.h
+++ b/paddle/fluid/framework/io/crypto/cipher_utils.h
@@ -17,6 +17,7 @@
 #include <sstream>
 #include <string>
 #include <unordered_map>
+
 namespace paddle {
 namespace framework {


--- a/paddle/fluid/framework/io/fs.h
+++ b/paddle/fluid/framework/io/fs.h
@@ -18,6 +18,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/io/shell.h"
 #include "paddle/fluid/string/string_helper.h"

--- a/paddle/fluid/framework/io/shell.h
+++ b/paddle/fluid/framework/io/shell.h
@@ -32,6 +32,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/string_helper.h"


--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {

+class Graph;
+
 class AttentionLSTMFusePass : public FusePassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;

--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
--- a/paddle/fluid/framework/ir/cudnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/cudnn_placement_pass.h
--- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
--- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h
+++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
--- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc
--- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
--- a/paddle/fluid/framework/ir/fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.h
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
--- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
--- a/paddle/fluid/framework/ir/fuse_bn_act_pass.h
+++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.h
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
--- a/paddle/fluid/framework/ir/fuse_pass_base.cc
+++ b/paddle/fluid/framework/ir/fuse_pass_base.cc
--- a/paddle/fluid/framework/ir/fuse_pass_base.h
+++ b/paddle/fluid/framework/ir/fuse_pass_base.h
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.h
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.h
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
--- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h
+++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h
--- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h
+++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h
--- a/paddle/fluid/framework/ir/fusion_group/operation.h
+++ b/paddle/fluid/framework/ir/fusion_group/operation.h
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
--- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
--- a/paddle/fluid/framework/ir/graph_to_program_pass.h
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.h
--- a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc
+++ b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
--- a/paddle/fluid/framework/ir/graph_traits.h
+++ b/paddle/fluid/framework/ir/graph_traits.h
--- a/paddle/fluid/framework/ir/graph_viz_pass.h
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
--- a/paddle/fluid/framework/ir/is_test_pass.h
+++ b/paddle/fluid/framework/ir/is_test_pass.h
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
--- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
--- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
--- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc
--- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
--- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
--- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h
--- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
--- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.h
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.h
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
--- a/paddle/fluid/framework/ir/pass_builder.cc
+++ b/paddle/fluid/framework/ir/pass_builder.cc
--- a/paddle/fluid/framework/ir/pass_builder.h
+++ b/paddle/fluid/framework/ir/pass_builder.h
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ b/paddle/fluid/framework/ir/pass_test.cc
--- a/paddle/fluid/framework/ir/placement_pass_base.h
+++ b/paddle/fluid/framework/ir/placement_pass_base.h
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.h
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
+++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h
+++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
+++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h
+++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
--- a/paddle/fluid/framework/ir/subgraph_detector.cc
+++ b/paddle/fluid/framework/ir/subgraph_detector.cc
--- a/paddle/fluid/framework/ir/subgraph_detector.h
+++ b/paddle/fluid/framework/ir/subgraph_detector.h
--- a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
--- a/paddle/fluid/framework/lod_rank_table.h
+++ b/paddle/fluid/framework/lod_rank_table.h
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
--- a/paddle/fluid/framework/mixed_vector_test.cc
+++ b/paddle/fluid/framework/mixed_vector_test.cc
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
--- a/paddle/fluid/framework/no_need_buffer_vars_inference.h
+++ b/paddle/fluid/framework/no_need_buffer_vars_inference.h
--- a/paddle/fluid/framework/op_call_stack.h
+++ b/paddle/fluid/framework/op_call_stack.h
--- a/paddle/fluid/framework/op_compatible_info.h
+++ b/paddle/fluid/framework/op_compatible_info.h
--- a/paddle/fluid/framework/op_compatible_info_test.cc
+++ b/paddle/fluid/framework/op_compatible_info_test.cc
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
--- a/paddle/fluid/framework/op_info.cc
+++ b/paddle/fluid/framework/op_info.cc
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
--- a/paddle/fluid/framework/op_kernel_type_test.cc
+++ b/paddle/fluid/framework/op_kernel_type_test.cc
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
--- a/paddle/fluid/framework/op_registry.cc
+++ b/paddle/fluid/framework/op_registry.cc
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
--- a/paddle/fluid/framework/op_version_registry_test.cc
+++ b/paddle/fluid/framework/op_version_registry_test.cc
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
--- a/paddle/fluid/framework/prune.h
+++ b/paddle/fluid/framework/prune.h
--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
--- a/paddle/fluid/framework/pull_dense_worker.cc
+++ b/paddle/fluid/framework/pull_dense_worker.cc
--- a/paddle/fluid/framework/rw_lock_test.cc
+++ b/paddle/fluid/framework/rw_lock_test.cc
--- a/paddle/fluid/framework/save_load_util.h
+++ b/paddle/fluid/framework/save_load_util.h
--- a/paddle/fluid/framework/save_load_util_test.cc
+++ b/paddle/fluid/framework/save_load_util_test.cc
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
--- a/paddle/fluid/framework/scope_pool.cc
+++ b/paddle/fluid/framework/scope_pool.cc
--- a/paddle/fluid/framework/scope_pool.h
+++ b/paddle/fluid/framework/scope_pool.h
--- a/paddle/fluid/framework/scope_test.cc
+++ b/paddle/fluid/framework/scope_test.cc
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
--- a/paddle/fluid/framework/shape_inference.cc
+++ b/paddle/fluid/framework/shape_inference.cc
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
--- a/paddle/fluid/framework/trainer_factory.cc
+++ b/paddle/fluid/framework/trainer_factory.cc
--- a/paddle/fluid/framework/trainer_factory.h
+++ b/paddle/fluid/framework/trainer_factory.h
--- a/paddle/fluid/framework/transfer_scope_cache.h
+++ b/paddle/fluid/framework/transfer_scope_cache.h
--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
--- a/paddle/fluid/framework/tuple_test.cc
+++ b/paddle/fluid/framework/tuple_test.cc
--- a/paddle/fluid/framework/unroll_array_ops.h
+++ b/paddle/fluid/framework/unroll_array_ops.h
--- a/paddle/fluid/framework/unroll_array_ops_test.cc
+++ b/paddle/fluid/framework/unroll_array_ops_test.cc
--- a/paddle/fluid/framework/unused_var_check.cc
+++ b/paddle/fluid/framework/unused_var_check.cc
--- a/paddle/fluid/framework/unused_var_check.h
+++ b/paddle/fluid/framework/unused_var_check.h
--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
--- a/paddle/fluid/framework/variable_helper.h
+++ b/paddle/fluid/framework/variable_helper.h
--- a/paddle/fluid/framework/variable_test.cc
+++ b/paddle/fluid/framework/variable_test.cc
--- a/paddle/fluid/framework/version.cc
+++ b/paddle/fluid/framework/version.cc
--- a/paddle/fluid/imperative/all_reduce.h
+++ b/paddle/fluid/imperative/all_reduce.h
--- a/paddle/fluid/imperative/amp_auto_cast.cc
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
--- a/paddle/fluid/imperative/amp_auto_cast.h
+++ b/paddle/fluid/imperative/amp_auto_cast.h
--- a/paddle/fluid/imperative/data_loader.cc
+++ b/paddle/fluid/imperative/data_loader.cc
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
--- a/paddle/fluid/imperative/jit/op_desc_meta.cc
+++ b/paddle/fluid/imperative/jit/op_desc_meta.cc
--- a/paddle/fluid/imperative/jit/op_desc_meta.h
+++ b/paddle/fluid/imperative/jit/op_desc_meta.h
--- a/paddle/fluid/imperative/jit/program_desc_tracer.cc
+++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc
--- a/paddle/fluid/imperative/jit/program_desc_tracer.h
+++ b/paddle/fluid/imperative/jit/program_desc_tracer.h
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
--- a/paddle/fluid/imperative/profiler.cc
+++ b/paddle/fluid/imperative/profiler.cc
--- a/paddle/fluid/imperative/tests/nccl_context_test.cc
+++ b/paddle/fluid/imperative/tests/nccl_context_test.cc
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
--- a/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h
+++ b/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h
--- a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h
+++ b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h
--- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h
--- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
--- a/paddle/fluid/inference/api/details/reset_tensor_array.cc
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
--- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
--- a/paddle/fluid/inference/api/paddle_infer_declare.h
+++ b/paddle/fluid/inference/api/paddle_infer_declare.h
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
--- a/paddle/fluid/inference/capi/pd_predictor.cc
+++ b/paddle/fluid/inference/capi/pd_predictor.cc
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/hard_sigmoid_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/hard_sigmoid_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
--- a/paddle/fluid/inference/tensorrt/op_teller.h
+++ b/paddle/fluid/inference/tensorrt/op_teller.h
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
--- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
+++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
--- a/paddle/fluid/inference/utils/benchmark.cc
+++ b/paddle/fluid/inference/utils/benchmark.cc
--- a/paddle/fluid/inference/utils/io_utils.h
+++ b/paddle/fluid/inference/utils/io_utils.h
--- a/paddle/fluid/inference/utils/singleton.h
+++ b/paddle/fluid/inference/utils/singleton.h
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
--- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
--- a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
--- a/paddle/fluid/memory/allocation/best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.h
--- a/paddle/fluid/memory/allocation/buffered_allocator.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator.cc
--- a/paddle/fluid/memory/allocation/buffered_allocator.h
+++ b/paddle/fluid/memory/allocation/buffered_allocator.h
--- a/paddle/fluid/memory/allocation/cpu_allocator.cc
+++ b/paddle/fluid/memory/allocation/cpu_allocator.cc
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
--- a/paddle/fluid/memory/allocation/mmap_allocator.cc
+++ b/paddle/fluid/memory/allocation/mmap_allocator.cc
--- a/paddle/fluid/memory/allocation/mmap_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/mmap_allocator_test.cc
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
--- a/paddle/fluid/memory/allocation/thread_local_allocator.h
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.h
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
--- a/paddle/fluid/memory/detail/buddy_allocator_test.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
--- a/paddle/fluid/operators/activation_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
--- a/paddle/fluid/operators/add_position_encoding_op.cc
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
--- a/paddle/fluid/operators/assert_op.cc
+++ b/paddle/fluid/operators/assert_op.cc
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
--- a/paddle/fluid/operators/assign_op.h
+++ b/paddle/fluid/operators/assign_op.h
--- a/paddle/fluid/operators/assign_op_test.cc
+++ b/paddle/fluid/operators/assign_op_test.cc
--- a/paddle/fluid/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
--- a/paddle/fluid/operators/assign_value_op.h
+++ b/paddle/fluid/operators/assign_value_op.h
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
--- a/paddle/fluid/operators/batch_size_like.h
+++ b/paddle/fluid/operators/batch_size_like.h
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
--- a/paddle/fluid/operators/benchmark/op_tester.cc
+++ b/paddle/fluid/operators/benchmark/op_tester.cc
--- a/paddle/fluid/operators/benchmark/op_tester_config.cc
+++ b/paddle/fluid/operators/benchmark/op_tester_config.cc
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
--- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc
--- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
--- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc
--- a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_op.cc
--- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
--- a/paddle/fluid/operators/collective/c_reduce_max_op.cc
+++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc
--- a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc
--- a/paddle/fluid/operators/collective/c_reduce_min_op.cc
+++ b/paddle/fluid/operators/collective/c_reduce_min_op.cc
--- a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc
--- a/paddle/fluid/operators/collective/c_reduce_prod_op.cc
+++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cc
--- a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc
--- a/paddle/fluid/operators/collective/c_reduce_sum_op.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cc
--- a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc
--- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
+++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
--- a/paddle/fluid/operators/common_infer_shape_functions.cc
+++ b/paddle/fluid/operators/common_infer_shape_functions.cc
--- a/paddle/fluid/operators/common_infer_shape_functions.h
+++ b/paddle/fluid/operators/common_infer_shape_functions.h
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
--- a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc
--- a/paddle/fluid/operators/controlflow/conditional_block_op.h
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.h
--- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc
--- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h
+++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h
--- a/paddle/fluid/operators/controlflow/conditional_block_op_test.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op_test.cc
--- a/paddle/fluid/operators/controlflow/feed_op.cc
+++ b/paddle/fluid/operators/controlflow/feed_op.cc
--- a/paddle/fluid/operators/controlflow/get_places_op.cc
+++ b/paddle/fluid/operators/controlflow/get_places_op.cc
--- a/paddle/fluid/operators/controlflow/op_variant.h
+++ b/paddle/fluid/operators/controlflow/op_variant.h
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.h
+++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.h
--- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
--- a/paddle/fluid/operators/controlflow/while_op_helper.h
+++ b/paddle/fluid/operators/controlflow/while_op_helper.h
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
--- a/paddle/fluid/operators/delete_var_op.cc
+++ b/paddle/fluid/operators/delete_var_op.cc
--- a/paddle/fluid/operators/dequantize_abs_max_op.cc
+++ b/paddle/fluid/operators/dequantize_abs_max_op.cc
--- a/paddle/fluid/operators/dequantize_abs_max_op.h
+++ b/paddle/fluid/operators/dequantize_abs_max_op.h
--- a/paddle/fluid/operators/dequantize_log_op.cc
+++ b/paddle/fluid/operators/dequantize_log_op.cc
--- a/paddle/fluid/operators/dequantize_log_op.h
+++ b/paddle/fluid/operators/dequantize_log_op.h
--- a/paddle/fluid/operators/detection/gpc.cc
+++ b/paddle/fluid/operators/detection/gpc.cc
--- a/paddle/fluid/operators/diag_v2_op.cu
+++ b/paddle/fluid/operators/diag_v2_op.cu
--- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
+++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
--- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
+++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
--- a/paddle/fluid/operators/distributed/collective_client.cc
+++ b/paddle/fluid/operators/distributed/collective_client.cc
--- a/paddle/fluid/operators/distributed/collective_client.h
+++ b/paddle/fluid/operators/distributed/collective_client.h
--- a/paddle/fluid/operators/distributed/collective_server.cc
+++ b/paddle/fluid/operators/distributed/collective_server.cc
--- a/paddle/fluid/operators/distributed/collective_server.h
+++ b/paddle/fluid/operators/distributed/collective_server.h
--- a/paddle/fluid/operators/distributed/collective_server_test.cc
+++ b/paddle/fluid/operators/distributed/collective_server_test.cc
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ b/paddle/fluid/operators/distributed/communicator.cc
--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
--- a/paddle/fluid/operators/distributed/communicator_test.cc
+++ b/paddle/fluid/operators/distributed/communicator_test.cc
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
--- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
--- a/paddle/fluid/operators/distributed/grpc/grpc_server.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_server.h
--- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc
--- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h
--- a/paddle/fluid/operators/distributed/heart_beat_monitor.cc
+++ b/paddle/fluid/operators/distributed/heart_beat_monitor.cc
--- a/paddle/fluid/operators/distributed/heart_beat_monitor.h
+++ b/paddle/fluid/operators/distributed/heart_beat_monitor.h
--- a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc
+++ b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc
--- a/paddle/fluid/operators/distributed/large_scale_kv.h
+++ b/paddle/fluid/operators/distributed/large_scale_kv.h
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
--- a/paddle/fluid/operators/distributed/parameter_recv.cc
+++ b/paddle/fluid/operators/distributed/parameter_recv.cc
--- a/paddle/fluid/operators/distributed/parameter_send.cc
+++ b/paddle/fluid/operators/distributed/parameter_send.cc
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
--- a/paddle/fluid/operators/distributed/varhandle_test.cc
+++ b/paddle/fluid/operators/distributed/varhandle_test.cc
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
--- a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
+++ b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
--- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
--- a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h
+++ b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h
--- a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
--- a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc
--- a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h
--- a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc
--- a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h
--- a/paddle/fluid/operators/distributed_ops/prefetch_op.cc
+++ b/paddle/fluid/operators/distributed_ops/prefetch_op.cc
--- a/paddle/fluid/operators/distributed_ops/recv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_op.cc
--- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
+++ b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
--- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h
+++ b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h
--- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
--- a/paddle/fluid/operators/distributed_ops/send_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_op.cc
--- a/paddle/fluid/operators/distributed_ops/send_recv_util.h
+++ b/paddle/fluid/operators/distributed_ops/send_recv_util.h
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.h
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
--- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
--- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
--- a/paddle/fluid/operators/enqueue_op.cc
+++ b/paddle/fluid/operators/enqueue_op.cc
--- a/paddle/fluid/operators/eye_op.h
+++ b/paddle/fluid/operators/eye_op.h
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
--- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
--- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
--- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
--- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
--- a/paddle/fluid/operators/gru_op.cu.cc
+++ b/paddle/fluid/operators/gru_op.cu.cc
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
--- a/paddle/fluid/operators/hash_op.cc
+++ b/paddle/fluid/operators/hash_op.cc
--- a/paddle/fluid/operators/hash_op.h
+++ b/paddle/fluid/operators/hash_op.h
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
--- a/paddle/fluid/operators/instance_norm_op.cc
+++ b/paddle/fluid/operators/instance_norm_op.cc
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
--- a/paddle/fluid/operators/isfinite_op.cc
+++ b/paddle/fluid/operators/isfinite_op.cc
--- a/paddle/fluid/operators/isfinite_op.h
+++ b/paddle/fluid/operators/isfinite_op.h
--- a/paddle/fluid/operators/isfinite_v2_op.cc
+++ b/paddle/fluid/operators/isfinite_v2_op.cc
--- a/paddle/fluid/operators/isfinite_v2_op.h
+++ b/paddle/fluid/operators/isfinite_v2_op.h
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
--- a/paddle/fluid/operators/jit/gen/act.h
+++ b/paddle/fluid/operators/jit/gen/act.h
--- a/paddle/fluid/operators/jit/gen/blas.h
+++ b/paddle/fluid/operators/jit/gen/blas.h
--- a/paddle/fluid/operators/jit/gen/embseqpool.cc
+++ b/paddle/fluid/operators/jit/gen/embseqpool.cc
--- a/paddle/fluid/operators/jit/gen/embseqpool.h
+++ b/paddle/fluid/operators/jit/gen/embseqpool.h
--- a/paddle/fluid/operators/jit/gen/gru.cc
+++ b/paddle/fluid/operators/jit/gen/gru.cc
--- a/paddle/fluid/operators/jit/gen/gru.h
+++ b/paddle/fluid/operators/jit/gen/gru.h
--- a/paddle/fluid/operators/jit/gen/hopv.h
+++ b/paddle/fluid/operators/jit/gen/hopv.h
--- a/paddle/fluid/operators/jit/gen/lstm.cc
+++ b/paddle/fluid/operators/jit/gen/lstm.cc
--- a/paddle/fluid/operators/jit/gen/lstm.h
+++ b/paddle/fluid/operators/jit/gen/lstm.h
--- a/paddle/fluid/operators/jit/gen/matmul.cc
+++ b/paddle/fluid/operators/jit/gen/matmul.cc
--- a/paddle/fluid/operators/jit/gen/matmul.h
+++ b/paddle/fluid/operators/jit/gen/matmul.h
--- a/paddle/fluid/operators/jit/gen/seqpool.cc
+++ b/paddle/fluid/operators/jit/gen/seqpool.cc
--- a/paddle/fluid/operators/jit/gen/seqpool.h
+++ b/paddle/fluid/operators/jit/gen/seqpool.h
--- a/paddle/fluid/operators/jit/gen/sgd.cc
+++ b/paddle/fluid/operators/jit/gen/sgd.cc
--- a/paddle/fluid/operators/jit/gen/sgd.h
+++ b/paddle/fluid/operators/jit/gen/sgd.h
--- a/paddle/fluid/operators/jit/gen/vbroadcast.cc
+++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc
--- a/paddle/fluid/operators/jit/gen/vbroadcast.h
+++ b/paddle/fluid/operators/jit/gen/vbroadcast.h
--- a/paddle/fluid/operators/jit/gen_base.cc
+++ b/paddle/fluid/operators/jit/gen_base.cc
--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
--- a/paddle/fluid/operators/jit/helper.cc
+++ b/paddle/fluid/operators/jit/helper.cc
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
--- a/paddle/fluid/operators/jit/kernel_pool.cc
+++ b/paddle/fluid/operators/jit/kernel_pool.cc
--- a/paddle/fluid/operators/jit/kernel_pool.h
+++ b/paddle/fluid/operators/jit/kernel_pool.h
--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
--- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
+++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
--- a/paddle/fluid/operators/jit/more/mix/mix.h
+++ b/paddle/fluid/operators/jit/more/mix/mix.h
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
--- a/paddle/fluid/operators/linspace_op.cc
+++ b/paddle/fluid/operators/linspace_op.cc
--- a/paddle/fluid/operators/lod_array_length_op.cc
+++ b/paddle/fluid/operators/lod_array_length_op.cc
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
--- a/paddle/fluid/operators/match_matrix_tensor_op.cc
+++ b/paddle/fluid/operators/match_matrix_tensor_op.cc
--- a/paddle/fluid/operators/math/beam_search.cc
+++ b/paddle/fluid/operators/math/beam_search.cc
--- a/paddle/fluid/operators/math/beam_search_test.cc
+++ b/paddle/fluid/operators/math/beam_search_test.cc
--- a/paddle/fluid/operators/math/blas.cc
+++ b/paddle/fluid/operators/math/blas.cc
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
--- a/paddle/fluid/operators/math/concat_and_split.cc
+++ b/paddle/fluid/operators/math/concat_and_split.cc
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
--- a/paddle/fluid/operators/math/context_project.cc
+++ b/paddle/fluid/operators/math/context_project.cc
--- a/paddle/fluid/operators/math/context_project.h
+++ b/paddle/fluid/operators/math/context_project.h
--- a/paddle/fluid/operators/math/cos_sim_functor.cc
+++ b/paddle/fluid/operators/math/cos_sim_functor.cc
--- a/paddle/fluid/operators/math/cos_sim_functor.h
+++ b/paddle/fluid/operators/math/cos_sim_functor.h
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
--- a/paddle/fluid/operators/math/cross_entropy.cc
+++ b/paddle/fluid/operators/math/cross_entropy.cc
--- a/paddle/fluid/operators/math/gru_compute.cc
+++ b/paddle/fluid/operators/math/gru_compute.cc
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
--- a/paddle/fluid/operators/math/lstm_compute.cc
+++ b/paddle/fluid/operators/math/lstm_compute.cc
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
--- a/paddle/fluid/operators/math/matrix_inverse.cu.cc
+++ b/paddle/fluid/operators/math/matrix_inverse.cu.cc
--- a/paddle/fluid/operators/math/pooling.cc
+++ b/paddle/fluid/operators/math/pooling.cc
--- a/paddle/fluid/operators/math/sample_prob.cc
+++ b/paddle/fluid/operators/math/sample_prob.cc
--- a/paddle/fluid/operators/math/sample_prob.h
+++ b/paddle/fluid/operators/math/sample_prob.h
--- a/paddle/fluid/operators/math/segment_pooling.cu
+++ b/paddle/fluid/operators/math/segment_pooling.cu
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
--- a/paddle/fluid/operators/math/sequence2batch.cc
+++ b/paddle/fluid/operators/math/sequence2batch.cc
--- a/paddle/fluid/operators/math/sequence_padding.cc
+++ b/paddle/fluid/operators/math/sequence_padding.cc
--- a/paddle/fluid/operators/math/sequence_padding_test.cc
+++ b/paddle/fluid/operators/math/sequence_padding_test.cc
--- a/paddle/fluid/operators/math/sequence_scale.cc
+++ b/paddle/fluid/operators/math/sequence_scale.cc
--- a/paddle/fluid/operators/math/sequence_scale.h
+++ b/paddle/fluid/operators/math/sequence_scale.h
--- a/paddle/fluid/operators/math/vol2col.cc
+++ b/paddle/fluid/operators/math/vol2col.cc
--- a/paddle/fluid/operators/math/vol2col_test.cc
+++ b/paddle/fluid/operators/math/vol2col_test.cc
--- a/paddle/fluid/operators/matmul_v2_op.cu
+++ b/paddle/fluid/operators/matmul_v2_op.cu
--- a/paddle/fluid/operators/matmul_v2_op.h
+++ b/paddle/fluid/operators/matmul_v2_op.h
--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
--- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
--- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
--- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
--- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
--- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
--- a/paddle/fluid/operators/norm_utils.cu.h
+++ b/paddle/fluid/operators/norm_utils.cu.h
--- a/paddle/fluid/operators/op_debug_string_test.cc
+++ b/paddle/fluid/operators/op_debug_string_test.cc
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.h
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h
--- a/paddle/fluid/operators/pixel_shuffle_op.cc
+++ b/paddle/fluid/operators/pixel_shuffle_op.cc
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
--- a/paddle/fluid/operators/pyramid_hash_op.cc
+++ b/paddle/fluid/operators/pyramid_hash_op.cc
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
--- a/paddle/fluid/operators/reader/py_reader.cc
+++ b/paddle/fluid/operators/reader/py_reader.cc
--- a/paddle/fluid/operators/reader/py_reader.h
+++ b/paddle/fluid/operators/reader/py_reader.h
--- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
+++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
--- a/paddle/fluid/operators/recurrent_op.h
+++ b/paddle/fluid/operators/recurrent_op.h
--- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc
+++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
--- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
--- a/paddle/fluid/operators/scatter_test.cc
+++ b/paddle/fluid/operators/scatter_test.cc
--- a/paddle/fluid/operators/segment_pool_op.cu
+++ b/paddle/fluid/operators/segment_pool_op.cu
--- a/paddle/fluid/operators/segment_pool_op.h
+++ b/paddle/fluid/operators/segment_pool_op.h
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
--- a/paddle/fluid/operators/shuffle_channel_op.cc
+++ b/paddle/fluid/operators/shuffle_channel_op.cc
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
--- a/paddle/fluid/operators/strided_memcpy_test.cc
+++ b/paddle/fluid/operators/strided_memcpy_test.cc
--- a/paddle/fluid/operators/tensor_formatter.cc
+++ b/paddle/fluid/operators/tensor_formatter.cc
--- a/paddle/fluid/operators/tensor_formatter.h
+++ b/paddle/fluid/operators/tensor_formatter.h
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
--- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
+++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
--- a/paddle/fluid/operators/utils.h
+++ b/paddle/fluid/operators/utils.h
--- a/paddle/fluid/operators/var_conv_2d_op.cc
+++ b/paddle/fluid/operators/var_conv_2d_op.cc
--- a/paddle/fluid/platform/bfloat16.h
+++ b/paddle/fluid/platform/bfloat16.h
--- a/paddle/fluid/platform/bfloat16_test.cc
+++ b/paddle/fluid/platform/bfloat16_test.cc
--- a/paddle/fluid/platform/collective_helper.cc
+++ b/paddle/fluid/platform/collective_helper.cc
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
--- a/paddle/fluid/platform/cuda_profiler.h
+++ b/paddle/fluid/platform/cuda_profiler.h
--- a/paddle/fluid/platform/cuda_resource_pool.h
+++ b/paddle/fluid/platform/cuda_resource_pool.h
--- a/paddle/fluid/platform/cudnn_desc.h
+++ b/paddle/fluid/platform/cudnn_desc.h
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
--- a/paddle/fluid/platform/device_code.cc
+++ b/paddle/fluid/platform/device_code.cc
--- a/paddle/fluid/platform/device_code.h
+++ b/paddle/fluid/platform/device_code.h
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/device_memory_aligment.h
+++ b/paddle/fluid/platform/device_memory_aligment.h
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
--- a/paddle/fluid/platform/dynload/cuda_driver.h
+++ b/paddle/fluid/platform/dynload/cuda_driver.h
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
--- a/paddle/fluid/platform/dynload/cupti.cc
+++ b/paddle/fluid/platform/dynload/cupti.cc
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
--- a/paddle/fluid/platform/dynload/cusolver.h
+++ b/paddle/fluid/platform/dynload/cusolver.h
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
--- a/paddle/fluid/platform/dynload/nvrtc.h
+++ b/paddle/fluid/platform/dynload/nvrtc.h
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/errors_test.cc
+++ b/paddle/fluid/platform/errors_test.cc
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
--- a/paddle/fluid/platform/lodtensor_printer.cc
+++ b/paddle/fluid/platform/lodtensor_printer.cc
--- a/paddle/fluid/platform/lodtensor_printer.h
+++ b/paddle/fluid/platform/lodtensor_printer.h
--- a/paddle/fluid/platform/lodtensor_printer_test.cc
+++ b/paddle/fluid/platform/lodtensor_printer_test.cc
--- a/paddle/fluid/platform/monitor.cc
+++ b/paddle/fluid/platform/monitor.cc
--- a/paddle/fluid/platform/monitor.h
+++ b/paddle/fluid/platform/monitor.h
--- a/paddle/fluid/platform/place_test.cc
+++ b/paddle/fluid/platform/place_test.cc
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
--- a/paddle/fluid/platform/stream/cuda_stream.h
+++ b/paddle/fluid/platform/stream/cuda_stream.h
--- a/paddle/fluid/platform/timer.h
+++ b/paddle/fluid/platform/timer.h
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
--- a/paddle/fluid/string/piece_test.cc
+++ b/paddle/fluid/string/piece_test.cc
--- a/paddle/fluid/string/pretty_log.h
+++ b/paddle/fluid/string/pretty_log.h
--- a/paddle/fluid/string/string_helper.cc
+++ b/paddle/fluid/string/string_helper.cc
--- a/paddle/fluid/string/string_helper.h
+++ b/paddle/fluid/string/string_helper.h
--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
--- a/paddle/fluid/train/demo/demo_trainer.cc
+++ b/paddle/fluid/train/demo/demo_trainer.cc
--- a/paddle/fluid/train/imdb_demo/demo_trainer.cc
+++ b/paddle/fluid/train/imdb_demo/demo_trainer.cc
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ b/paddle/fluid/train/test_train_recognize_digits.cc
--- a/paddle/scripts/installation_validate.py
+++ b/paddle/scripts/installation_validate.py
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
--- a/python/paddle/dataset/__init__.py
+++ b/python/paddle/dataset/__init__.py
--- a/python/paddle/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
--- a/python/paddle/dataset/tests/test_sentiment.py
+++ b/python/paddle/dataset/tests/test_sentiment.py
--- a/python/paddle/distributed/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
--- a/python/paddle/distributed/fleet/base/util_factory.py
+++ b/python/paddle/distributed/fleet/base/util_factory.py
--- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
--- a/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
--- a/python/paddle/distributed/fleet/utils/__init__.py
+++ b/python/paddle/distributed/fleet/utils/__init__.py
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
--- a/python/paddle/fluid/contrib/layers/nn.py
+++ b/python/paddle/fluid/contrib/layers/nn.py
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
--- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
+++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
--- a/python/paddle/fluid/data.py
+++ b/python/paddle/fluid/data.py
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
--- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
--- a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
+++ b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
--- a/python/paddle/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
--- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_relu_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_relu_fuse_pass.py
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
--- a/python/paddle/fluid/tests/unittests/test_communicator_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_async.py
--- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
--- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
--- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
--- a/python/paddle/fluid/tests/unittests/test_data.py
+++ b/python/paddle/fluid/tests/unittests/test_data.py
--- a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py
+++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py
--- a/python/paddle/fluid/tests/unittests/test_diag.py
+++ b/python/paddle/fluid/tests/unittests/test_diag.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
--- a/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
--- a/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
--- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
--- a/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
--- a/python/paddle/fluid/tests/unittests/test_fleet_util.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py
--- a/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py
--- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py
--- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
+++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
--- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
--- a/python/paddle/fluid/tests/unittests/test_minimum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_minimum_op.py
--- a/python/paddle/fluid/tests/unittests/test_mse_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
--- a/python/paddle/fluid/tests/unittests/test_nll_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_nll_loss.py
--- a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
--- a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
--- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
--- a/python/paddle/fluid/tests/unittests/test_numel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_numel_op.py
--- a/python/paddle/fluid/tests/unittests/test_ones_like.py
+++ b/python/paddle/fluid/tests/unittests/test_ones_like.py
--- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
--- a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
+++ b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_sort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sort_op.py
--- a/python/paddle/fluid/tests/unittests/test_tile_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tile_op.py
--- a/python/paddle/fluid/tests/unittests/test_transformer_api.py
+++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
--- a/python/paddle/framework/io.py
+++ b/python/paddle/framework/io.py
--- a/python/paddle/hapi/callbacks.py
+++ b/python/paddle/hapi/callbacks.py
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
--- a/python/paddle/hapi/model_summary.py
+++ b/python/paddle/hapi/model_summary.py
--- a/python/paddle/tensor/io.py
+++ b/python/paddle/tensor/io.py
--- a/python/paddle/io/__init__.py
+++ b/python/paddle/io/__init__.py
--- a/python/paddle/metric/metrics.py
+++ b/python/paddle/metric/metrics.py
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
--- a/python/paddle/static/input.py
+++ b/python/paddle/static/input.py
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
--- a/python/paddle/tests/dist_hapi_mnist_dynamic.py
+++ b/python/paddle/tests/dist_hapi_mnist_dynamic.py
--- a/python/paddle/tests/dist_hapi_mnist_static.py
+++ b/python/paddle/tests/dist_hapi_mnist_static.py
--- a/python/paddle/tests/test_dataset_movie_reviews.py
+++ b/python/paddle/tests/test_dataset_movie_reviews.py
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
--- a/python/paddle/tests/test_pretrained_model.py
+++ b/python/paddle/tests/test_pretrained_model.py
--- a/python/paddle/tests/test_vision_models.py
+++ b/python/paddle/tests/test_vision_models.py
--- a/python/paddle/text/datasets/__init__.py
+++ b/python/paddle/text/datasets/__init__.py
--- a/python/paddle/text/datasets/movie_reviews.py
+++ b/python/paddle/text/datasets/movie_reviews.py
--- a/python/paddle/vision/models/lenet.py
+++ b/python/paddle/vision/models/lenet.py
--- a/python/paddle/vision/models/mobilenetv1.py
+++ b/python/paddle/vision/models/mobilenetv1.py
--- a/python/paddle/vision/models/mobilenetv2.py
+++ b/python/paddle/vision/models/mobilenetv2.py
--- a/python/paddle/vision/models/resnet.py
+++ b/python/paddle/vision/models/resnet.py
--- a/python/paddle/vision/models/vgg.py
+++ b/python/paddle/vision/models/vgg.py
--- a/python/setup.py.in
+++ b/python/setup.py.in
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
--- a/tools/dockerfile/Dockerfile.centos
+++ b/tools/dockerfile/Dockerfile.centos
--- a/tools/dockerfile/ci_dockerfile.sh
+++ b/tools/dockerfile/ci_dockerfile.sh
--- a/tools/enforce/count_enforce_by_file.sh
+++ b/tools/enforce/count_enforce_by_file.sh
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
--- a/tools/wlist.json
+++ b/tools/wlist.json