diff --git a/.gitignore b/.gitignore
index 708126b3bb070f0ce3b4e751b8732b77af8b36c4..e905833cae7a60f46f6d8fddf5403d46808873f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,8 @@ paddle/fluid/API_DEV.spec
 paddle/fluid/API_PR.spec
 paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
-paddle/pten/api/*/api*
+paddle/pten/api/*/api.*
+paddle/pten/api/*/backward*
 paddle/pten/include/*
 paddle/pten/extension.h
 
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index c7a6f04b5f40a202d849d91d3d07a8bfb3ea7fff..578fb1621603f203ca85aefbcdd27ae51481172a 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()
 
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220104")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220116")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
diff --git a/paddle/fluid/distributed/service/brpc_utils.h b/paddle/fluid/distributed/service/brpc_utils.h
index 556bbb1048e2c4cca4aef337235e3c476deb859b..ebae710acc28b58a503bc9c0b455ef7c5ca10cff 100644
--- a/paddle/fluid/distributed/service/brpc_utils.h
+++ b/paddle/fluid/distributed/service/brpc_utils.h
@@ -27,7 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 namespace butil {
 class IOBuf;
@@ -78,11 +78,11 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
                                         const framework::Scope* scope);
 
 void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
-                          butil::IOBufBytesIterator& iobuf,
+                          butil::IOBufBytesIterator& iobuf,  // NOLINT
                           const platform::DeviceContext& ctx);
 
 void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
-                             butil::IOBufBytesIterator& iobuf,
+                             butil::IOBufBytesIterator& iobuf,  // NOLINT
                              const platform::DeviceContext& ctx);
 
 std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port);
diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h
index 3b00f1d6ccc3a1b66ff2b3f146aa33b15fa9c41b..3408ef5f91ad009a33c28fb4093a79075112c0bd 100644
--- a/paddle/fluid/distributed/table/depends/large_scale_kv.h
+++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h
@@ -40,9 +40,9 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 namespace paddle {
 namespace distributed {
@@ -202,7 +202,7 @@ class ValueBlock {
       // value = _alloc.acquire(value_length_);
       table[id] = value;
     } else {
-      value = (VALUE *)(void *)(res->second);
+      value = (VALUE *)(void *)(res->second);  // NOLINT
     }
     return value;
   }
@@ -282,8 +282,8 @@ class ValueBlock {
         value->unseen_days_++;
         if (value->unseen_days_ >= threshold) {
           butil::return_object(iter->second);
-          //_alloc.release(iter->second);
-          //_alloc.release(value);
+          // _alloc.release(iter->second);
+          // _alloc.release(value);
           iter = table.erase(iter);
         } else {
           ++iter;
diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc
index ffd76c5bda62125bd2e38ed003cf001a9556081d..826b02b3db0720c0d158a778c8f441e312085c5c 100644
--- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc
+++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc
@@ -216,8 +216,9 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
 
 #define PADDLE_TENSOR_ADD(cpp_type)                                          \
   if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
-    TensorAddFunctor<cpp_type> func(numel, src_tensor->data<cpp_type>(),     \
-                                    dst_tensor->mutable_data<cpp_type>());   \
+    TensorAddFunctor<cpp_type> func(                                         \
+        numel, src_tensor->data<cpp_type>(),                                 \
+        dst_tensor->mutable_data<cpp_type>(place));                          \
     paddle::platform::VisitPlace(place, func);                               \
     return;                                                                  \
   }
diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
index 668e60d857b9ca371243891db686421810fda0bb..c504a126ddecaebfcb55313573d6bc490007feef 100644
--- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_subdirectory(final_state_generator)
+#add_subdirectory(final_state_generator)
 
 set(EAGER_GENERETOR_DEPS ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag)
 
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
index 4400e01b8a2d302ae638fcdbcaa016c0b92f8534..b74cdcf78dcb3b55eb7cb27459100fe6eb22ac9b 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
@@ -75,6 +75,10 @@ def GetAutoGradMetaName(string):
     return f"{string}_autograd_meta"
 
 
+def GetAutoGradMetaVectorName(string):
+    return f"{string}_autograd_meta_vec"
+
+
 ######################
 ###  File Readers  ###
 ######################
@@ -219,10 +223,6 @@ def ParseYamlBackward(args_str, returns_str):
 def ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
                             forward_returns_list, orig_forward_inputs_list,
                             orig_forward_attrs_list, orig_forward_returns_list):
-    # inputs_list          = [ [input_name, input_type, orig_position], ...]
-    # attrs_list           = [ [attr_name, attr_type, default_value, orig_position], ...]
-    # forward_returns_list = [ [ret_name, ret_type, orig_position] ...]
-    # orig_returns_list    = [ [ret_type, orig_position], ...]
     for i in range(len(forward_inputs_list)):
         forward_input_name = forward_inputs_list[i][0]
         forward_input_type = forward_inputs_list[i][1]
@@ -270,9 +270,6 @@ def ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
 
 def BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
                             backward_attrs_list):
-    # backward_fwd_input_map   = { "name" : [type, is_fwd_input, orig_position] ...}
-    # backward_grad_input_map  = { "name" : [type, fwd_position, orig_position] ...}
-    # backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
 
     # Check Order: TensorWrappers, GradTensors, Attributes
     max_fwd_input_position = -1
@@ -291,10 +288,6 @@ def BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
 
 
 def DetermineForwardPositionMap(forward_inputs_list, forward_returns_list):
-    # inputs_list          = [ [input_name, input_type, orig_position], ...]
-    # forward_returns_list = [ [ret_name, ret_type, orig_position] ...]
-
-    # forward_position_map = { "name" : [type, fwd_position] ...}
     forward_inputs_position_map = {}
     forward_outputs_position_map = {}
     for i in range(len(forward_inputs_list)):
@@ -319,15 +312,6 @@ def DetermineForwardPositionMap(forward_inputs_list, forward_returns_list):
 def SlotNameMatching(backward_inputs_list, backward_returns_list,
                      forward_inputs_position_map, forward_outputs_position_map):
 
-    # backward_inputs_list  = [ [input_name, input_type, orig_position], ...]
-    # backward_returns_list = [ [ret_name, ret_type, orig_position], ...]
-    # forward_inputs_position_map  = { "name" : [type, fwd_position] }
-    # forward_outputs_position_map = { "name" : [type, fwd_position] }
-
-    # backward_fwd_input_map   = { "name" : [type, is_fwd_input, orig_position] ...}
-    # backward_grad_input_map  = { "name" : [type, fwd_position, orig_position] ...}
-    # backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...}
-
     backward_fwd_input_map = {}
     backward_grad_input_map = {}
     backward_grad_output_map = {}
@@ -580,7 +564,14 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name,
     compute_require_grad_args_list = ["trace_backward"]
     for name, (ttype, pos) in forward_inputs_position_map.items():
         input_autograd_meta_name = GetAutoGradMetaName(name)
-        input_autograd_meta = f"    auto* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});"
+        if IsPlainTensorType(ttype):
+            input_autograd_meta = f"    egr::EagerTensor* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});"
+        else:
+            assert IsVectorTensorType(ttype)
+            input_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
+            input_autograd_meta = f"    std::vector<egr::EagerTensor*> {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n"
+            input_autograd_meta += f"    std::vector<egr::EagerTensor*>* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};"
+
         inputs_autograd_meta_list.append(input_autograd_meta)
         compute_require_grad_args_list.append(input_autograd_meta_name)
     inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list)
@@ -592,11 +583,23 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name,
     num_fwd_outputs = len(forward_outputs_position_map.keys())
     for name, (rtype, pos) in forward_outputs_position_map.items():
         output_autograd_meta_name = GetAutoGradMetaName(name)
+        output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
         if num_fwd_outputs == 1:
-            output_autograd_meta = f"    auto* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs);"
+            if IsPlainTensorType(rtype):
+                output_autograd_meta = f"    egr::EagerTensor* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs);"
+            else:
+                assert IsVectorTensorType(rtype)
+                output_autograd_meta = f"    std::vector<egr::EagerTensor*> {output_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta(outputs);\n"
+                output_autograd_meta += f"    std::vector<egr::EagerTensor*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
         else:
             # Tuple api_result
-            outputs_autograd_meta = f"    auto* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs[{pos}]);"
+            if IsPlainTensorType(rtype):
+                output_autograd_meta = f"    egr::EagerTensor* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs[{pos}]);"
+            else:
+                assert IsVectorTensorType(rtype)
+                output_autograd_meta = f"    std::vector<egr::EagerTensor*> {output_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta(outputs[{pos}]);\n"
+                output_autograd_meta += f"    std::vector<egr::EagerTensor*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
+
         outputs_autograd_meta_list.append(output_autograd_meta)
         pass_stop_gradient_args_list.append(output_autograd_meta_name)
 
@@ -786,7 +789,6 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
     
     auto outputs = {};
     
-    // Node Creation
 {}
 
     // Returns
@@ -903,17 +905,10 @@ if __name__ == "__main__":
         # Collect Forward Inputs/Outputs
         forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward(
             bwd_forward_str)
-        print("Parsed Forward Inputs List: ", forward_inputs_list)
-        print("Prased Forward Attrs List: ", forward_attrs_list)
-        print("Parsed Forward Returns List: ", forward_returns_list)
 
         # Collect Original Forward Inputs/Outputs and then perform validation checks
         orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward(
             fwd_args_str, fwd_returns_str)
-        print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list)
-        print("Prased Original Forward Attrs List: ", orig_forward_attrs_list)
-        print("Parsed Original Forward Returns List: ",
-              orig_forward_returns_list)
 
         # Forward Validation Checks
         ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
@@ -924,25 +919,15 @@ if __name__ == "__main__":
         # Parse Backward Inputs/Outputs
         backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward(
             bwd_args_str, bwd_returns_str)
-        print("Parsed Backward Inputs List: ", backward_inputs_list)
-        print("Prased Backward Attrs List: ", backward_attrs_list)
-        print("Parsed Backward Returns List: ", backward_returns_list)
 
         # Determine Forward Inputs/Outputs Position
         forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
             forward_inputs_list, forward_returns_list)
-        print("Generated Forward Input Position Map: ",
-              forward_inputs_position_map)
-        print("Generated Forward Output Position Map: ",
-              forward_outputs_position_map)
 
         # SlotName Matching
         backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching(
             backward_inputs_list, backward_returns_list,
             forward_inputs_position_map, forward_outputs_position_map)
-        print("Generated Backward Fwd Input Map: ", backward_fwd_input_map)
-        print("Generated Backward Grad Input Map: ", backward_grad_input_map)
-        print("Generated Backward Grad Output Map: ", backward_grad_output_map)
 
         # Backward Validation Check
         BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
@@ -951,13 +936,11 @@ if __name__ == "__main__":
         # Node Declaration Generation
         node_declaration_str += GenerateNodeDeclaration(
             fwd_api_name, backward_fwd_input_map, backward_attrs_list)
-        print("Generated Node Declaration: ", node_declaration_str)
 
         node_definition_str += GenerateNodeDefinition(
             fwd_api_name, bwd_api_name, backward_fwd_input_map,
             backward_grad_input_map, backward_grad_output_map,
             backward_attrs_list)
-        print("Generated Node Definition: ", node_definition_str)
 
         # Node Definition Generation
         definition_declaration_pair = GenerateForwardDefinition(
@@ -965,8 +948,6 @@ if __name__ == "__main__":
             forward_outputs_position_map, forward_attrs_list,
             backward_fwd_input_map, backward_grad_input_map,
             backward_grad_output_map, backward_attrs_list)
-        print("Generated Forward Definition: ", forward_definition_str)
-        print("Generated Forward Declaration: ", forward_declaration_str)
         forward_definition_str += definition_declaration_pair[0]
         forward_declaration_str += definition_declaration_pair[1]
 
diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc
index cdc970100951362eaa30692142b674a74a19064f..64f980d709ad99c6061ceda0759d12deaaac7dd8 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc
@@ -36,7 +36,8 @@ TEST(AccumulationNode, EagerTensor) {
           paddle::platform::CPUPlace())
           .get(),
       meta);
-  dt0->mutable_data<paddle::platform::float16>()[0] = 10.0;
+  dt0->mutable_data<paddle::platform::float16>(
+      paddle::platform::CPUPlace())[0] = 10.0;
   EagerTensor et0 = EagerTensor(dt0);
 
   std::shared_ptr<pten::DenseTensor> dt1 = std::make_shared<pten::DenseTensor>(
@@ -45,7 +46,8 @@ TEST(AccumulationNode, EagerTensor) {
           .get(),
       meta);
 
-  dt1->mutable_data<paddle::platform::float16>()[0] = 20.0;
+  dt1->mutable_data<paddle::platform::float16>(
+      paddle::platform::CPUPlace())[0] = 20.0;
   EagerTensor et1 = EagerTensor(dt1);
 
   std::shared_ptr<pten::DenseTensor> grad_dt =
diff --git a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc
index 3d45dc831d41124351cfa74507dd7c1371546186..1c5102f7a21a7f94c3587f315de60ec8c022c89b 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc
@@ -46,7 +46,7 @@ TEST(AutogradMeta, MemberFunction) {
           paddle::platform::CPUPlace())
           .get(),
       meta);
-  auto* dt_ptr = dt->mutable_data<float>();
+  auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr[0] = 5.0f;
   dt_ptr[1] = 10.0f;
   grad_t->set_impl(dt);
diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
index a483ddb6a98f6ddf0262a0c56cc5051872865047..620fa52cac6db9bbb638814729b006a736fef2f3 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
@@ -40,7 +40,7 @@ TEST(EagerTensor, Constructor) {
           paddle::platform::CPUPlace())
           .get(),
       meta);
-  auto* dt_ptr = dt->mutable_data<float>();
+  auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr[0] = 5.0f;
   dt_ptr[1] = 10.0f;
   egr::EagerTensor et3 = egr::EagerTensor(dt);
@@ -70,7 +70,7 @@ TEST(EagerTensor, MemberFunction) {
           paddle::platform::CPUPlace())
           .get(),
       meta);
-  auto* dt_ptr = dt->mutable_data<float>();
+  auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr[0] = 5.0f;
   dt_ptr[1] = 10.0f;
   VLOG(6) << "Make Dense Tensor";
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
index 19850b3210b7630e4071933f5d5149366a200b34..ea4b4a480e2c079660f8fbb00d55ee1eb41bdba6 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
@@ -45,7 +45,7 @@ TEST(GradNodeInfo, GradNodeBase) {
           paddle::platform::CPUPlace())
           .get(),
       meta);
-  auto* dt_ptr = dt->mutable_data<float>();
+  auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr[0] = 5.0f;
   egr::EagerTensor et1(dt);
   grads = {{et1}};
@@ -102,7 +102,7 @@ TEST(GradNodeInfo, GradNodeBase) {
             paddle::platform::CPUPlace())
             .get(),
         meta);
-    auto* dt_ptr = dt->mutable_data<float>();
+    auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
     dt_ptr[0] = 6.0f;
     auto* et_ptr =
         std::dynamic_pointer_cast<pten::DenseTensor>(et.impl())->data<float>();
@@ -121,8 +121,8 @@ TEST(GradNodeInfo, GradNodeBase) {
 
   VLOG(6) << "Test Reduce Hook";
   auto reduce_hook = [&](void) -> void {
-    auto* et_ptr = std::dynamic_pointer_cast<pten::DenseTensor>(et1.impl())
-                       ->mutable_data<float>();
+    auto* et_ptr =
+        std::dynamic_pointer_cast<pten::DenseTensor>(et1.impl())->data<float>();
     et_ptr[0] = 100.0;
     VLOG(6) << "Running Reduce Hook";
   };
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
index 433a00e27be0e90800f1dffedf19b358c8fc9f56..bf9c3a93e1636dd67a173308ffe5b5df7916319c 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
@@ -41,7 +41,7 @@ class GradTestNode : public egr::GradNodeBase {
             paddle::platform::CPUPlace())
             .get(),
         meta);
-    auto* dt_ptr = dt->mutable_data<float>();
+    auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
     dt_ptr[0] = 6.0f;
     egr::EagerTensor et1(dt);
     std::vector<std::vector<egr::EagerTensor>> res = {{et1}};
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
index c88a5f5fdcef5701ca007d8a67682239922ee59a..c2830bf7ef6afa46a21889a8ac9a45a1a4c352ee 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
@@ -57,7 +57,7 @@ TEST(GradTensorHolder, Interfaces) {
           paddle::platform::CPUPlace())
           .get(),
       meta);
-  dt0->mutable_data<float>()[0] = 10.0;
+  dt0->mutable_data<float>(paddle::platform::CPUPlace())[0] = 10.0;
   EagerTensor et0 = EagerTensor(dt0);
 
   std::shared_ptr<pten::DenseTensor> dt1 = std::make_shared<pten::DenseTensor>(
@@ -65,7 +65,7 @@ TEST(GradTensorHolder, Interfaces) {
           paddle::platform::CPUPlace())
           .get(),
       meta);
-  dt1->mutable_data<float>()[0] = 20.0;
+  dt1->mutable_data<float>(paddle::platform::CPUPlace())[0] = 20.0;
   EagerTensor et1 = EagerTensor(dt1);
 
   // Constructor empty GradTensorHolder
diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
index 8bc739d455a958c43a581dde19c4c3be850a7caa..742a64ecec23975e2081ef16f2476206ff751442 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
@@ -29,7 +29,7 @@ TEST(TensorWrapper, Basic) {
           paddle::platform::CPUPlace())
           .get(),
       meta);
-  auto* dt_ptr = dt->mutable_data<float>();
+  auto* dt_ptr = dt->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr[0] = 5.0f;
   dt_ptr[1] = 10.0f;
   et1.set_impl(dt);
@@ -56,7 +56,7 @@ TEST(TensorWrapper, Basic) {
           paddle::platform::CPUPlace())
           .get(),
       meta2);
-  auto* dt_ptr2 = dt->mutable_data<float>();
+  auto* dt_ptr2 = dt2->mutable_data<float>(paddle::platform::CPUPlace());
   dt_ptr2[0] = 6.0f;
   dt_ptr2[1] = 11.0f;
   et2.set_impl(dt2);
diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc
index 1b2f1287b069d9ef905f0e46336cddaf5a7d551b..3bd5b98a164d63bf778876f8924c54001348028e 100644
--- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc
@@ -35,7 +35,7 @@ TEST(EagerUtils, AutoGradMeta) {
           paddle::platform::CPUPlace())
           .get(),
       meta);
-  dt0->mutable_data<float>()[0] = 10.0;
+  dt0->mutable_data<float>(paddle::platform::CPUPlace())[0] = 10.0;
   EagerTensor et0 = EagerTensor(dt0);
 
   std::shared_ptr<pten::DenseTensor> dt1 = std::make_shared<pten::DenseTensor>(
@@ -43,7 +43,7 @@ TEST(EagerUtils, AutoGradMeta) {
           paddle::platform::CPUPlace())
           .get(),
       meta);
-  dt1->mutable_data<float>()[0] = 20.0;
+  dt1->mutable_data<float>(paddle::platform::CPUPlace())[0] = 20.0;
   EagerTensor et1 = EagerTensor(dt1);
 
   std::vector<EagerTensor> ets = {et0, et1};
@@ -112,7 +112,7 @@ egr::EagerTensor CreateTestCPUTensor(T val,
           paddle::platform::CPUPlace())
           .get(),
       meta);
-  auto* dt_ptr = dt->mutable_data<T>();
+  auto* dt_ptr = dt->mutable_data<T>(paddle::platform::CPUPlace());
   for (int64_t i = 0; i < dt->numel(); i++) {
     dt_ptr[i] = val;
   }
diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
index 1fef0905b4cc5a8f3923c8785507f3ce5da046b3..45b7b80049560befa6510220979f12c1476389de 100644
--- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
@@ -44,8 +44,8 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) {
           paddle::memory::Alloc(place, bytes_size)),
       std::move(ret_meta));
 
-  float* t_ptr = t_dense->mutable_data<float>();
-  float* ret_ptr = ret_dense->mutable_data<float>();
+  float* t_ptr = t_dense->mutable_data<float>(place);
+  float* ret_ptr = ret_dense->mutable_data<float>(place);
   for (int i = 0; i < ret_dense->numel(); i++) {
     ret_ptr[i] = t_ptr[i] + 5.0;
   }
@@ -184,7 +184,7 @@ TEST(FwdBwdJoint, BranchedNodes) {
   // Examine Forward Output 2
   {
     auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out2.impl());
-    float* ptr = dense_out->mutable_data<float>();
+    float* ptr = dense_out->mutable_data<float>(paddle::platform::CPUPlace());
     for (int i = 0; i < 20; i++) {
       PADDLE_ENFORCE(ptr[i] == 150.0,
                      paddle::platform::errors::Fatal(
diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc
index 4f4a33b1a743afdf3f6f5c3652a2d87b3e0499ef..3d61167c52efeaf5a9e6d506a2c0bd6088a71a7f 100644
--- a/paddle/fluid/eager/tests/task_tests/hook_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc
@@ -45,8 +45,8 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) {
           paddle::memory::Alloc(place, bytes_size)),
       std::move(ret_meta));
 
-  float* t_ptr = t_dense->mutable_data<float>();
-  float* ret_ptr = ret_dense->mutable_data<float>();
+  float* t_ptr = t_dense->mutable_data<float>(place);
+  float* ret_ptr = ret_dense->mutable_data<float>(place);
   for (int i = 0; i < ret_dense->numel(); i++) {
     ret_ptr[i] = t_ptr[i] + 3.0;
   }
diff --git a/paddle/fluid/eager/tests/test_utils.h b/paddle/fluid/eager/tests/test_utils.h
index e7f3a89bf06b3f1ed567c8df231576e0179b4777..9c217dff499607343dc346f6cf39cb5a8ba45d2d 100644
--- a/paddle/fluid/eager/tests/test_utils.h
+++ b/paddle/fluid/eager/tests/test_utils.h
@@ -34,7 +34,7 @@ bool CompareGradTensorWithValue(const egr::EagerTensor& target, T value) {
   egr::AutogradMeta* meta = egr::EagerUtils::unsafe_autograd_meta(target);
   auto grad_dense =
       std::dynamic_pointer_cast<pten::DenseTensor>(meta->Grad().impl());
-  T* ptr = grad_dense->mutable_data<T>();
+  T* ptr = grad_dense->data<T>();
 
   std::vector<T> host_data(grad_dense->numel());
   if (paddle::platform::is_gpu_place(grad_dense->place())) {
@@ -67,7 +67,7 @@ template <typename T>
 bool CompareTensorWithValue(const egr::EagerTensor& target, T value) {
   // TODO(jiabin): Support Selected Rows later
   auto dense_t = std::dynamic_pointer_cast<pten::DenseTensor>(target.impl());
-  T* ptr = dense_t->mutable_data<T>();
+  T* ptr = dense_t->data<T>();
 
   std::vector<T> host_data(dense_t->numel());
   if (paddle::platform::is_gpu_place(dense_t->place())) {
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index 791e9a83fa09cb946377059ceddc0170380a1dd8..dbde9aa24ff02474a5f231e7f5d556d4af6e8836 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -20,9 +20,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/eigen_ext.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index d8b14fc0d4c3096126c0a5a743320024099e3215..7aadc856129a1302a0f349459636bda5e9456c1b 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -38,8 +38,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/reader/blocking_queue.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/timer.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/expect.h b/paddle/fluid/framework/expect.h
index 146f4de9382a687686d5f7fdd6f4fa2300cb043b..686a14fca15c19205b9242e5ad925f25520f133a 100644
--- a/paddle/fluid/framework/expect.h
+++ b/paddle/fluid/framework/expect.h
@@ -19,14 +19,18 @@
 #define _LINUX
 #endif
 
-#ifdef _LINUX
 #ifndef likely
-#define likely(x) __builtin_expect((x), 1)
+#ifdef _LINUX
+#define likely(expr) (__builtin_expect(!!(expr), 1))
+#else
+#define likely(expr) (expr)
 #endif
 #endif
 
-#ifdef _LINUX
 #ifndef unlikely
-#define unlikely(x) __builtin_expect((x), 0)
+#ifdef _LINUX
+#define unlikely(expr) (__builtin_expect(!!(expr), 0))
+#else
+#define unlikely(expr) (expr)
 #endif
 #endif
diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc
index 004dc71d07bf3795082d3a75d155e533580b0c83..f01894f2cf448130ee58d7716ddaef556c9ce9cd 100644
--- a/paddle/fluid/framework/io/shell.cc
+++ b/paddle/fluid/framework/io/shell.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "paddle/fluid/framework/io/shell.h"
 
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/io/shell.h b/paddle/fluid/framework/io/shell.h
index 6fd00a516de892bdd749b24687a742e04931d354..e92560980f52ca3d9f17fbf76f280c8b65b35b7d 100644
--- a/paddle/fluid/framework/io/shell.h
+++ b/paddle/fluid/framework/io/shell.h
@@ -34,8 +34,8 @@
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 #if defined(__arm__) || defined(__aarch64__) || defined(__ARM_NEON) || \
     defined(__ARM_NEON__)
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index 91d618970e30c80507a386ba5ba437931771c637..6c97c7fefb184b033773f835cbf5b48014f4aa6b 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -34,7 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/trainer_desc.pb.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/reader/blocking_queue.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 135ef6a970621cea6ee1418f751ffc37406628db..1ef633d0f12ec0e3bf9c7ba0817301170bd9fb16 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -19,7 +19,7 @@
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index 61c5d8d0e4fd76a516837cc202ee56bc8dfd0739..92989eed7c0cb09c2ce71c1dd0e698b8d106bcac 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -28,7 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 #ifdef _WIN32
 #include <direct.h>
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
index 57052243d2f189ec6f722d5820cba223dd914e4a..e418d412b556023ee73e2e03e992b6604add2cc2 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
@@ -20,7 +20,7 @@
 #include <vector>
 
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index c6d25137594b76a1ff67d9fb25b2480372c3eefa..6c0707e3475c7270afb6044e93f87a370f8a6e5d 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -31,8 +31,8 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 extern std::string paddle::framework::DataTypeToString(
     const framework::proto::VarType::Type type);
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index cf8a32ba94a1cacc1df9a195de7ff1ae8a790a98..ed0c8e51ac912bee53067c38aba09616d5ac801b 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -22,8 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/errors.h"
-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 DECLARE_bool(use_mkldnn);
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index f8a27da00ba2b84e9e60b26f4171053b91f03095..d18ff6f6bfe2f0b04966af9e80bc40f3bebfc593 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -8,7 +8,6 @@ set(pybind_file_final ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operators/CMakeLists.txt.  DO NOT EDIT!\n\n")
 
 add_subdirectory(math)
-add_subdirectory(eigen)
 add_subdirectory(controlflow)
 add_subdirectory(detection)
 add_subdirectory(elementwise)
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index c5ca1fd0e8cab8b5c27115e9698b475f2fef1b82..e5ba46f312897b78dacd60701b71ca031cb43531 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 DECLARE_bool(use_mkldnn);
 
diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h
index d2ef4c9befba99290008508e43df6c84f969b710..957efbff1993792c1cc6162296dbdcf00abb61cf 100644
--- a/paddle/fluid/operators/controlflow/compare_op.h
+++ b/paddle/fluid/operators/controlflow/compare_op.h
@@ -22,49 +22,40 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
-struct LessThanFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const { return a < b; }
-};
-
-template <typename T>
-struct LessEqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const { return a <= b; }
-};
-
-template <typename T>
-struct GreaterThanFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const { return a > b; }
-};
-
-template <typename T>
-struct GreaterEqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const { return a >= b; }
-};
-
-template <typename T>
+#define COMPARE_FUNCTOR(func_name, op)                           \
+  template <typename InT, typename OutT = bool>                  \
+  struct func_name {                                             \
+    using ELEM_TYPE = InT;                                       \
+    HOSTDEVICE OutT operator()(const InT a, const InT b) const { \
+      return static_cast<OutT>(a op b);                          \
+    }                                                            \
+  };
+
+COMPARE_FUNCTOR(LessThanFunctor, <)
+COMPARE_FUNCTOR(LessEqualFunctor, <=)
+COMPARE_FUNCTOR(GreaterThanFunctor, >)
+COMPARE_FUNCTOR(GreaterEqualFunctor, >=)
+#undef COMPARE_FUNCTOR
+
+template <typename InT, typename OutT = bool>
 struct EqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const {
-    if (std::is_floating_point<T>::value) {
+  using ELEM_TYPE = InT;
+  HOSTDEVICE OutT operator()(const InT a, const InT b) const {
+    if (std::is_floating_point<InT>::value) {
       // This branch will be optimized while compiling if T is integer. It is
       // safe to cast a and b to double.
-      return fabs(static_cast<double>(a - b)) < 1e-8;
+      return static_cast<OutT>(fabs(static_cast<double>(a - b)) < 1e-8);
     } else {
-      return (a == b);
+      return static_cast<OutT>(a == b);
     }
   }
 };
 
-template <typename T>
+template <typename InT, typename OutT = bool>
 struct NotEqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const {
-    return !EqualFunctor<T>()(a, b);
+  using ELEM_TYPE = InT;
+  HOSTDEVICE OutT operator()(const InT a, const InT b) const {
+    return static_cast<OutT>(!EqualFunctor<InT, OutT>()(a, b));
   }
 };
 
diff --git a/paddle/fluid/operators/eigen/CMakeLists.txt b/paddle/fluid/operators/eigen/CMakeLists.txt
deleted file mode 100644
index 8b64e35b93526eb7edbe7f723832126ef7f0e0a6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/eigen/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-file(GLOB EIGEN_CC_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu")
-if(WITH_GPU)
-  nv_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3)
-elseif(WITH_ROCM)
-  hip_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3)
-else()
-  cc_library(eigen_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3)
-endif()
diff --git a/paddle/fluid/operators/eigen/eigen_function.h b/paddle/fluid/operators/eigen/eigen_function.h
index 9a3be7ca439b9aead2e931c7fa3036128400b057..a460e4c04c5f9bedf6fb1d914cf0dcc9096d1332 100644
--- a/paddle/fluid/operators/eigen/eigen_function.h
+++ b/paddle/fluid/operators/eigen/eigen_function.h
@@ -18,243 +18,71 @@ limitations under the License. */
 #ifndef NOMINMAX
 #define NOMINMAX
 #endif
-#include "unsupported/Eigen/CXX11/Tensor"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
 namespace paddle {
 namespace operators {
 
 template <typename EigenDevice, typename T, int Rank>
-struct EigenBroadcast {
-  using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using InType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  using OutType = Eigen::TensorMap<
-      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  static void Eval(const EigenDevice& dev, OutType out, InType in,
-                   const Array& bcast);
-  static void Eval(const EigenDevice& dev, OutType32BitIndex out,
-                   InType32BitIndex in, const Array& bcast);
-};
+using EigenBroadcast = pten::funcs::EigenBroadcast<EigenDevice, T, Rank>;
 
 template <typename EigenDevice, typename T, int Rank>
-struct EigenBroadcastGrad {
-  using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
-  using Array2 = Eigen::DSizes<Eigen::DenseIndex, Rank * 2>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, InType in,
-                   const Array& reduce_dims, const Array2& reshape_dims);
-};
+using EigenBroadcastGrad =
+    pten::funcs::EigenBroadcastGrad<EigenDevice, T, Rank>;
 
 template <typename EigenDevice, typename T, int Rank>
-struct EigenConstant {
-  using Type = Eigen::TensorMap<
-      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, Type out, const T value);
-};
+using EigenConstant = pten::funcs::EigenConstant<EigenDevice, T, Rank>;
 
 template <typename EigenDevice, typename T>
-struct EigenSign {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in);
-};
+using EigenSign = pten::funcs::EigenSign<EigenDevice, T>;
 
 template <typename EigenDevice, typename T, int Rank>
-struct EigenReverse {
-  using Array = Eigen::DSizes<bool, Rank>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType = Eigen::TensorMap<
-      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in,
-                   const Array& reverse);
-};
+using EigenReverse = pten::funcs::EigenReverse<EigenDevice, T, Rank>;
 
 template <typename EigenDevice, typename T>
-struct EigenAdd {
-  using InType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      const T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in,
-                   const T value);
-};
+using EigenAdd = pten::funcs::EigenAdd<EigenDevice, T>;
 
 template <typename EigenDevice, typename T>
-struct EigenSub {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& left,
-                   const InType& right);
-};
+using EigenSub = pten::funcs::EigenSub<EigenDevice, T>;
 
 template <typename EigenDevice, typename T, int Rank>
-struct EigenSlice {
-  using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
-  using Array32Bit = Eigen::DSizes<int, Rank>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using InType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  using OutType = Eigen::TensorMap<
-      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in,
-                   const Array& offsets, const Array& extents);
-  static void Eval(const EigenDevice& dev, OutType32BitIndex out,
-                   const InType32BitIndex& in, const Array32Bit& offsets,
-                   const Array32Bit& extents);
-};
+using EigenSlice = pten::funcs::EigenSlice<EigenDevice, T, Rank>;
 
 template <typename EigenDevice, typename T, int Rank>
-struct EigenPad {
-  using Array = std::array<std::pair<int64_t, int64_t>, Rank>;
-  using Array32Bit = std::array<std::pair<int, int>, Rank>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using InType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  using OutType = Eigen::TensorMap<
-      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType32BitIndex =
-      Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
-                       Eigen::Aligned>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in,
-                   const Array& padding, const T value);
-  static void Eval(const EigenDevice& dev, OutType32BitIndex out,
-                   const InType32BitIndex& in, const Array32Bit& padding,
-                   const T value);
-};
+using EigenPad = pten::funcs::EigenPad<EigenDevice, T, Rank>;
 
 template <typename EigenDevice, typename T>
-struct EigenScale {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in,
-                   const T scale, const T bias, const bool bias_after_scale);
-};
+using EigenScale = pten::funcs::EigenScale<EigenDevice, T>;
 
 template <typename EigenDevice, typename T>
-struct EigenErf {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in);
-};
+using EigenErf = pten::funcs::EigenErf<EigenDevice, T>;
 
 template <typename EigenDevice, typename T>
-struct EigenErfGrad {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType din, const InType& in,
-                   const InType& dout);
-};
+using EigenErfGrad = pten::funcs::EigenErfGrad<EigenDevice, T>;
 
 template <typename EigenDevice, typename T>
-struct EigenRankLoss {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& label,
-                   const InType& left, const InType& right);
-};
+using EigenRankLoss = pten::funcs::EigenRankLoss<EigenDevice, T>;
 
 template <typename EigenDevice, typename T>
-struct EigenRankLossGrad {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void EvalLeft(const EigenDevice& dev, OutType dleft,
-                       const InType& dout, const InType& label,
-                       const InType& left, const InType& right);
-  static void EvalRight(const EigenDevice& dev, OutType dright,
-                        const InType& dout, const InType& label,
-                        const InType& left, const InType& right);
-};
+using EigenRankLossGrad = pten::funcs::EigenRankLossGrad<EigenDevice, T>;
 
 template <typename EigenDevice, typename T>
-struct EigenLogLoss {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& pred,
-                   const InType& label, const T& epsilon);
-};
+using EigenLogLoss = pten::funcs::EigenLogLoss<EigenDevice, T>;
 
 template <typename EigenDevice, typename T>
-struct EigenLogLossGrad {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss,
-                   const InType& pred, const InType& label, const T& epsilon);
-};
+using EigenLogLossGrad = pten::funcs::EigenLogLossGrad<EigenDevice, T>;
 
 template <typename EigenDevice, typename T>
-struct EigenHingeLoss {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType loss, const InType& pred,
-                   const InType& label);
-};
+using EigenHingeLoss = pten::funcs::EigenHingeLoss<EigenDevice, T>;
 
 template <typename EigenDevice, typename T>
-struct EigenHingeLossGrad {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss,
-                   const InType& pred, const InType& label);
-};
+using EigenHingeLossGrad = pten::funcs::EigenHingeLossGrad<EigenDevice, T>;
 
 template <typename EigenDevice, typename T>
-struct EigenL1Norm {
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType out, const InType& in);
-};
+using EigenL1Norm = pten::funcs::EigenL1Norm<EigenDevice, T>;
 
 template <typename EigenDevice, typename T>
-struct EigenL1NormGrad {
-  using Array = Eigen::DSizes<Eigen::DenseIndex, 1>;
-  using InType = Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType =
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const EigenDevice& dev, OutType din, const InType& dout,
-                   const InType& in, const Array& bcast);
-};
+using EigenL1NormGrad = pten::funcs::EigenL1NormGrad<EigenDevice, T>;
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
index 6ef49e2cf3db7318f2eb8f0f55ffccd0e3bbad15..f1deab3e65299b5188fbbbb8583705a4560a9ad4 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/expect.h"
 #include "paddle/fluid/operators/fused/fusion_gru_op.h"
 #include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
 
@@ -41,7 +42,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
             ctx.InputName("X") + ctx.InputName("WeightH")) {
     const bool is_INT8 = std::is_same<T, uint8_t>::value;
 
-    if (!this->isCached()) {
+    if (unlikely(!this->isCached())) {
       // oneDNN kernel has hardcoded activation functions
       PADDLE_ENFORCE_EQ(
           ctx.Attr<std::string>("gate_activation"), "sigmoid",
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
index 385e4ad8808a51a207ef8779c4544da60f0a6a3d..dfd88248ede3452bab7a23ea8f3e349e23430349 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/expect.h"
 #include "paddle/fluid/operators/fused/fusion_lstm_op.h"
 #include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
 
@@ -40,7 +41,7 @@ class LSTMMKLDNNHandler
             ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
             is_reverse, N, Ti, IC, OC, 4,
             ctx.InputName("X") + ctx.InputName("WeightH")) {
-    if (!this->isCached()) {
+    if (unlikely(!this->isCached())) {
       const bool is_INT8 = std::is_same<T, uint8_t>::value;
       const bool use_peepholes = ctx.Attr<bool>("use_peepholes");
       // oneDNN kernel has hardcoded activation functions
diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h
index 5e3c1fc202d595cf7406841cae716d3ddcb59d02..03610d4589058e074f64940741df34bd8f66e379 100644
--- a/paddle/fluid/operators/kernel_primitives/functor_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h
@@ -15,9 +15,9 @@
 #pragma once
 
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/platform/eigen_ext.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 67176f26b079fcf294a7db2a6adc2d05d0908a24..f6178eb0a1eb6e8a4d1886443ec77b945c3b182f 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -477,6 +477,155 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
   }
 };
 
+#ifdef PADDLE_WITH_XPU
+template <typename T>
+struct MergeAdd<platform::XPUDeviceContext, T> {
+  framework::SelectedRows operator()(const platform::XPUDeviceContext& context,
+                                     const framework::SelectedRows& input,
+                                     const bool sorted_result = false) {
+    framework::SelectedRows out;
+    (*this)(context, input, &out, sorted_result);
+    return out;
+  }
+
+  void operator()(const platform::XPUDeviceContext& context,
+                  const framework::SelectedRows& input,
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
+    framework::Vector<int64_t> input_rows(input.rows());
+    if (input_rows.size() == 0) {
+      return;
+    }
+
+    framework::SelectedRows& out = *output;
+    std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
+    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+    auto input_width = input.value().dims()[1];
+
+    out.set_rows(merge_rows);
+    out.set_height(input.height());
+    out.mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), input_width}),
+        context.GetPlace());
+    int r =
+        xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
+                         merge_rows.size() * input_width, static_cast<T>(0.f));
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::External("XPU constant op return"
+                                                 " wrong value[%d %s].",
+                                                 r, XPUAPIErrorMsg[r]));
+
+    std::unordered_map<int64_t, size_t> rows_to_id;
+    for (size_t i = 0; i < merge_rows.size(); ++i) {
+      rows_to_id[merge_rows[i]] = i;
+    }
+
+    auto* out_data = out.mutable_value()->data<T>();
+    auto* input_data = input.value().data<T>();
+    int n = input_width;
+    for (size_t i = 0; i < input_rows.size(); i++) {
+      size_t out_i = rows_to_id[input_rows[i]];
+      auto r = xpu::add(context.x_context(), &input_data[i * input_width],
+                        &out_data[out_i * input_width],
+                        &out_data[out_i * input_width], n);
+      PADDLE_ENFORCE_EQ(
+          r, XPU_SUCCESS,
+          platform::errors::External("XPU API return wrong value[%d %s], ", r,
+                                     XPUAPIErrorMsg[r]));
+    }
+  }
+
+  // Sums rows that share a row id across all `inputs` into `output`. Inputs
+  // with no rows are skipped; `output` is left untouched if none has rows.
+  void operator()(const platform::XPUDeviceContext& context,
+                  const std::vector<const framework::SelectedRows*>& inputs,
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
+    if (inputs.size() == 0) {
+      VLOG(3) << "no input! return";
+      return;
+    }
+    const framework::SelectedRows* has_value_input = nullptr;
+    for (auto* in : inputs) {
+      if (in->rows().size() > 0) {
+        has_value_input = in;
+        break;
+      }
+    }
+    if (has_value_input == nullptr) {
+      VLOG(3) << "no input has value! just return" << std::endl;
+      return;
+    }
+    auto input_width = has_value_input->value().dims()[1];
+    auto input_height = has_value_input->height();
+    framework::SelectedRows& out = *output;
+    std::set<int64_t> merged_row_set;
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
+                        platform::errors::InvalidArgument(
+                            "All inputs should have same "
+                            "dimension except for the first one."));
+      PADDLE_ENFORCE_EQ(input_height, input->height(),
+                        platform::errors::InvalidArgument(
+                            "All inputs should have same height."));
+      merged_row_set.insert(input->rows().begin(), input->rows().end());
+    }
+
+    std::vector<int64_t> merge_rows(merged_row_set.begin(),
+                                    merged_row_set.end());
+
+    if (sorted_result) {
+      std::sort(merge_rows.begin(), merge_rows.end());
+    }
+
+    out.set_rows(merge_rows);
+    out.set_height(input_height);
+    out.mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merged_row_set.size()), input_width}),
+        context.GetPlace());
+
+    int r =
+        xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
+                         merge_rows.size() * input_width, static_cast<T>(0.f));
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::External("XPU constant op return"
+                                                 " wrong value[%d %s].",
+                                                 r, XPUAPIErrorMsg[r]));
+
+    auto* out_data = out.mutable_value()->data<T>();
+
+    std::unordered_map<int64_t, size_t> rows_to_id;
+    for (size_t i = 0; i < merge_rows.size(); ++i) {
+      rows_to_id[merge_rows[i]] = i;
+    }
+
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      auto& input_rows = input->rows();
+
+      int n = input_width;
+      for (size_t i = 0; i < input_rows.size(); i++) {
+        size_t out_i = rows_to_id[input_rows[i]];
+        auto r = xpu::add(
+            context.x_context(), input->value().data<T>() + i * input_width,
+            &out_data[out_i * input_width], &out_data[out_i * input_width], n);
+        PADDLE_ENFORCE_EQ(
+            r, XPU_SUCCESS,
+            platform::errors::External("XPU API return wrong value[%d %s], ", r,
+                                       XPUAPIErrorMsg[r]));
+      }
+    }
+  }
+};
+
+#endif
 template <typename T>
 struct MergeAverage<platform::CPUDeviceContext, T> {
   framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
@@ -589,6 +738,10 @@ template struct MergeAdd<platform::CPUDeviceContext,
 template struct MergeAdd<platform::CPUDeviceContext,
                          paddle::platform::bfloat16>;
 
+#ifdef PADDLE_WITH_XPU
+template struct MergeAdd<platform::XPUDeviceContext, float>;
+#endif
+
 template struct MergeAverage<platform::CPUDeviceContext, int>;
 template struct MergeAverage<platform::CPUDeviceContext, int64_t>;
 template struct MergeAverage<platform::CPUDeviceContext, float>;
diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc
index 3038a16dc0a5e53a0b8f7aa49942bcb916191f48..ddfb8d50c4e11ba47b7ac13b9da6405955d0ebf6 100644
--- a/paddle/fluid/operators/matrix_rank_op.cc
+++ b/paddle/fluid/operators/matrix_rank_op.cc
@@ -219,18 +219,20 @@ class MatrixRankCPUKernel : public framework::OpKernel<T> {
     tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1));
 
     Tensor compare_result;
-    compare_result.mutable_data<int>(detail::NewAxisDim(dim_out, k),
-                                     context.GetPlace());
+    compare_result.mutable_data<int64_t>(detail::NewAxisDim(dim_out, k),
+                                         context.GetPlace());
 
     int axis = -1;
     if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) {
-      ElementwiseComputeEx<GreaterThanFunctor<T>, platform::CPUDeviceContext, T,
-                           int>(context, &eigenvalue_tensor, &tol_tensor, axis,
-                                GreaterThanFunctor<T>(), &compare_result);
+      ElementwiseComputeEx<GreaterThanFunctor<T, int64_t>,
+                           platform::CPUDeviceContext, T, int64_t>(
+          context, &eigenvalue_tensor, &tol_tensor, axis,
+          GreaterThanFunctor<T, int64_t>(), &compare_result);
     } else {
-      ElementwiseComputeEx<LessThanFunctor<T>, platform::CPUDeviceContext, T,
-                           int>(context, &eigenvalue_tensor, &tol_tensor, axis,
-                                LessThanFunctor<T>(), &compare_result);
+      ElementwiseComputeEx<LessThanFunctor<T, int64_t>,
+                           platform::CPUDeviceContext, T, int64_t>(
+          context, &eigenvalue_tensor, &tol_tensor, axis,
+          LessThanFunctor<T, int64_t>(), &compare_result);
     }
     auto dito_int =
         math::DeviceIndependenceTensorOperations<platform::CPUDeviceContext,
diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu
index 1891a7be24e4560e12acdbecd15c2424845066ed..7362d00afb76f21ac3ec227892ad74a35bc90039 100644
--- a/paddle/fluid/operators/matrix_rank_op.cu
+++ b/paddle/fluid/operators/matrix_rank_op.cu
@@ -129,10 +129,10 @@ class MatrixRankGPUKernel : public framework::OpKernel<T> {
     compare_result.mutable_data<int64_t>(detail::NewAxisDim(dim_out, k),
                                          context.GetPlace());
     int axis = -1;
-    ElementwiseComputeEx<GreaterThanFunctor<T>, platform::CUDADeviceContext, T,
-                         int64_t>(context, &eigenvalue_tensor, &tol_tensor,
-                                  axis, GreaterThanFunctor<T>(),
-                                  &compare_result);
+    ElementwiseComputeEx<GreaterThanFunctor<T, int64_t>,
+                         platform::CUDADeviceContext, T, int64_t>(
+        context, &eigenvalue_tensor, &tol_tensor, axis,
+        GreaterThanFunctor<T, int64_t>(), &compare_result);
     auto dito_int =
         math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
                                                  int64_t>(context);
diff --git a/paddle/fluid/operators/matrix_rank_op.h b/paddle/fluid/operators/matrix_rank_op.h
index c3d99a21b72358df5dedc7741072a7913de174af..550bc445ac4e66a74965fe635a36c95b33dbed29 100644
--- a/paddle/fluid/operators/matrix_rank_op.h
+++ b/paddle/fluid/operators/matrix_rank_op.h
@@ -16,6 +16,7 @@
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/controlflow/compare_op.h"
 
 namespace paddle {
 namespace operators {
@@ -46,16 +47,6 @@ static DDim RemoveLastDim(const DDim& dim) {
 }
 }  // namespace detail
 
-template <typename T>
-struct GreaterThanFunctor {
-  HOSTDEVICE int operator()(const T a, const T b) const { return a > b; }
-};
-
-template <typename T>
-struct LessThanFunctor {
-  HOSTDEVICE int operator()(const T a, const T b) const { return a < b; }
-};
-
 template <typename T>
 struct GreaterElementFunctor {
   HOSTDEVICE T operator()(const T a, const T b) const {
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 68e2a7c8a91bb232fb479942d307679137b6172a..eef38bf99b1366a46b80e7e0088e838110787c39 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -14,6 +14,7 @@
 
 #include <tuple>
 
+#include "paddle/fluid/framework/expect.h"
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -79,7 +80,7 @@ class ConvMKLDNNHandlerT
             dev_ctx, mkldnn_engine, cpu_place,
             platform::CreateKey(dev_ctx, framework::vectorize(input->dims()),
                                 unique_name)) {
-    if (!this->isCached()) {
+    if (unlikely(!this->isCached())) {
       PADDLE_ENFORCE_EQ(
           input->layout(), framework::DataLayout::kMKLDNN,
           platform::errors::InvalidArgument(
@@ -264,7 +265,7 @@ class ConvMKLDNNHandlerT
             dev_ctx, dev_ctx.GetEngine(), cpu_place,
             platform::CreateKey(dev_ctx, framework::vectorize(in->dims()),
                                 unique_name)) {
-    if (!this->isBwdCached()) {
+    if (unlikely(!this->isBwdCached())) {
       PADDLE_ENFORCE_EQ(
           in->layout(), framework::DataLayout::kMKLDNN,
           platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc
index 8c7113d963bd5214d74b4289dc569e9c33359e57..7119d68d583f0a224860da51793ccc79ecb5b8c4 100644
--- a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/expect.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
@@ -39,7 +40,7 @@ class PReluMKLDNNHandler
             dev_ctx, engine, cpu_place,
             platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),
                                 uniq_name)) {
-    if (!this->isCached()) {
+    if (unlikely(!this->isCached())) {
       auto x_md = memory::desc(framework::vectorize(x->dims()),
                                MKLDNNGetDataType<T>(), x->format());
 
diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc
index 0a653c401171948af545709978ea5892b4099a92..e462c20c7f51db8195c3acba019d0aa225005dce 100644
--- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/optimizers/adam_op.h"
 #include "gflags/gflags.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 
 namespace paddle {
 namespace operators {
@@ -155,6 +156,11 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
                         mom2_out.template mutable_data<float>(ctx.GetPlace()),
                         param_out.template mutable_data<float>(ctx.GetPlace()),
                         beta1, beta2, epsilon, param.numel());
+
+      xpu_wait(dev_ctx.x_context()->xpu_stream);
+      PADDLE_ENFORCE_EQ(
+          r == xpu::Error_t::SUCCESS, true,
+          platform::errors::External("XPU API return wrong value[%d],", r));
       if (!use_global_beta_pow) {
         // update in cpu and then copy to xpu
         if (beta1_pow.place() == platform::CPUPlace() &&
@@ -165,7 +171,6 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
           const float* beta2_pow_p = beta2_pow.template data<float>();
           beta2_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
               beta2 * beta2_pow_p[0];
-          xpu_wait(dev_ctx.x_context()->xpu_stream);
         } else {
           float* beta1_pow_out_p =
               beta1_pow_out->mutable_data<float>(ctx.GetPlace());
@@ -177,23 +182,132 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
           PADDLE_ENFORCE_EQ(
               r, xpu::SUCCESS,
               platform::errors::External(
-                  "XPU kernel scale occur error in adamw error code ", r,
+                  "XPU kernel scale occur error in adam error code %d, %s", r,
                   XPUAPIErrorMsg[r]));
           r = xpu::scale(dev_ctx.x_context(), beta2_pow_ptr, beta2_pow_out_p,
                          beta2_pow.numel(), false, beta2, 0.0f);
           PADDLE_ENFORCE_EQ(
               r, xpu::SUCCESS,
               platform::errors::External(
-                  "XPU kernel scale occur error in adamw error code ", r,
+                  "XPU kernel scale occur error in adam error code %d, %s", r,
                   XPUAPIErrorMsg[r]));
+
+          xpu_wait(dev_ctx.x_context()->xpu_stream);
+        }
+      }
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      auto* grad = ctx.Input<framework::SelectedRows>("Grad");
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+      if (grad->rows().size() == 0) {
+        VLOG(3) << "grad row size is 0!!";
+        return;
+      }
+
+      // Skip MergeAdd when the row ids are already strictly increasing.
+      std::vector<int64_t> cpu_rows(grad->rows().begin(), grad->rows().end());
+      bool is_strict_sorted = true;
+      for (size_t i = 1; i < cpu_rows.size(); ++i) {
+        if (cpu_rows[i - 1] >= cpu_rows[i]) {
+          is_strict_sorted = false;
+          break;
         }
+      }
+
+      framework::SelectedRows tmp_grad_merge;
+      const framework::SelectedRows* grad_merge_ptr;
+      if (is_strict_sorted) {
+        grad_merge_ptr = grad;
+      } else {
+        scatter::MergeAdd<platform::XPUDeviceContext, T> merge_func;
+        merge_func(ctx.template device_context<platform::XPUDeviceContext>(),
+                   *grad, &tmp_grad_merge, true);
+
+        xpu_wait(dev_ctx.x_context()->xpu_stream);
+        grad_merge_ptr = &tmp_grad_merge;
+      }
+      // Beta pows held on the CPU are copied to ctx.GetPlace() before use.
+      const T* beta1_pow_ptr = beta1_pow.template data<T>();
+      const T* beta2_pow_ptr = beta2_pow.template data<T>();
+      Tensor xpu_beta1_pow;
+      Tensor xpu_beta2_pow;
+      if (beta1_pow.place() == platform::CPUPlace() &&
+          beta2_pow.place() == platform::CPUPlace()) {
+        paddle::framework::TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx,
+                                      &xpu_beta1_pow);
+        paddle::framework::TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx,
+                                      &xpu_beta2_pow);
+        dev_ctx.Wait();
+        beta1_pow_ptr = xpu_beta1_pow.template data<T>();
+        beta2_pow_ptr = xpu_beta2_pow.template data<T>();
+      }
+      auto& grad_merge = *grad_merge_ptr;
+      auto& grad_tensor = grad_merge.value();
+      const T* grad_data = grad_tensor.template data<T>();
+      // Narrow the row ids to int and copy them to ctx.GetPlace().
+      int row_count = grad_merge.rows().size();
+      std::vector<int> rows(row_count);
+      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+      int* xpu_rows = RAII_GUARD.alloc_l3_or_gm<int>(row_count);
+      std::vector<int64_t> merge_rows(grad_merge.rows().begin(),
+                                      grad_merge.rows().end());
+      for (size_t i = 0; i < grad_merge.rows().size(); ++i) {
+        rows[i] = static_cast<int>(merge_rows[i]);
+      }
+      xpu_wait(dev_ctx.x_context()->xpu_stream);
+      memory::Copy(ctx.GetPlace(), xpu_rows, platform::CPUPlace(), rows.data(),
+                   row_count * sizeof(int));
+      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
+      auto ori_rows = param.numel() / row_numel;
 
-        PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
-                          platform::errors::External(
-                              "XPU API return wrong value[%d], please check "
-                              "where Baidu Kunlun Card is properly installed.",
-                              r));
+      int lazy_mode = static_cast<int>(ctx.Attr<bool>("lazy_mode"));
+      int r = xpu::sparse_adam(
+          dev_ctx.x_context(), grad_data, mom1.template data<T>(),
+          mom2.template data<T>(), param.template data<T>(), beta1_pow_ptr,
+          beta2_pow_ptr, lr.template data<T>(),
+          mom1_out.template mutable_data<T>(ctx.GetPlace()),
+          mom2_out.template mutable_data<T>(ctx.GetPlace()),
+          param_out.template mutable_data<T>(ctx.GetPlace()), beta1, beta2,
+          epsilon, ori_rows, xpu_rows, row_numel, grad_merge.rows().size(),
+          lazy_mode);
+
+      PADDLE_ENFORCE_EQ(
+          r == xpu::Error_t::SUCCESS, true,
+          platform::errors::External("XPU API return wrong value[%d],", r));
+
+      if (!use_global_beta_pow) {
+        // update in cpu and then copy to xpu
+        if (beta1_pow.place() == platform::CPUPlace() &&
+            beta2_pow.place() == platform::CPUPlace()) {
+          const float* beta1_pow_p = beta1_pow.template data<float>();
+          beta1_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
+              beta1 * beta1_pow_p[0];
+          const float* beta2_pow_p = beta2_pow.template data<float>();
+          beta2_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
+              beta2 * beta2_pow_p[0];
+        } else {
+          float* beta1_pow_out_p =
+              beta1_pow_out->mutable_data<float>(ctx.GetPlace());
+          float* beta2_pow_out_p =
+              beta2_pow_out->mutable_data<float>(ctx.GetPlace());
+          int r =
+              xpu::scale(dev_ctx.x_context(), beta1_pow_ptr, beta1_pow_out_p,
+                         beta1_pow.numel(), false, beta1, 0.0f);
+          PADDLE_ENFORCE_EQ(
+              r, xpu::SUCCESS,
+              platform::errors::External(
+                  "XPU kernel scale occur error in adam error code %d, %s", r,
+                  XPUAPIErrorMsg[r]));
+          r = xpu::scale(dev_ctx.x_context(), beta2_pow_ptr, beta2_pow_out_p,
+                         beta2_pow.numel(), false, beta2, 0.0f);
+          PADDLE_ENFORCE_EQ(
+              r, xpu::SUCCESS,
+              platform::errors::External(
+                  "XPU kernel scale occur error in adam error code %d, %s", r,
+                  XPUAPIErrorMsg[r]));
+        }
       }
+      xpu_wait(dev_ctx.x_context()->xpu_stream);
     } else {
       PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument(
                                   "Variable type not supported by adam_op"));
diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h
index 6d98522d752196690a110922a3c41c0bf60c7476..9a3eaa66caa8e870f2692c67aea29535dbd7492a 100644
--- a/paddle/fluid/operators/optimizers/lamb_op.h
+++ b/paddle/fluid/operators/optimizers/lamb_op.h
@@ -22,8 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/algorithm.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/operators/math/squared_l2_norm.h"
-#include "paddle/fluid/platform/eigen_ext.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h
index 6e6c826a22892dad86aad907ca360bd3b8236d03..0aa39c9af5c1723dd6b99ec8598762e5dd9d7a98 100644
--- a/paddle/fluid/operators/save_combine_op.h
+++ b/paddle/fluid/operators/save_combine_op.h
@@ -27,7 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/string_array.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu
index d40d14435a5fd016a9ab5aaeb0436f13654a510b..7ad3335009b06056fb624ef97305ac549b14035f 100644
--- a/paddle/fluid/operators/viterbi_decode_op.cu
+++ b/paddle/fluid/operators/viterbi_decode_op.cu
@@ -72,7 +72,8 @@ struct BinaryOperation<platform::CUDADeviceContext, BinaryFunctor, T> {
   }
 };
 
-template <template <typename T> typename CompareFunctor, typename T>
+template <template <typename InT, typename OutT> typename CompareFunctor,
+          typename T>
 struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
   void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs,
                   const Tensor& rhs, Tensor* mask) {
@@ -81,7 +82,7 @@ struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     paddle::operators::LaunchSameDimsElementwiseCudaKernel<
         ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs,
-                                              CompareFunctor<int64_t>());
+                                              CompareFunctor<int64_t, T>());
   }
 };
 
diff --git a/paddle/fluid/operators/viterbi_decode_op.h b/paddle/fluid/operators/viterbi_decode_op.h
index 2b392ae74cc82dfad7bf2821b8610ec29882e754..ab95dbc763a5e4e762c1cea6b04e579d2c8b316b 100644
--- a/paddle/fluid/operators/viterbi_decode_op.h
+++ b/paddle/fluid/operators/viterbi_decode_op.h
@@ -112,12 +112,13 @@ void SameDimsBinaryOP(const Tensor& lhs, const Tensor& rhs, Tensor* out) {
   }
 }
 
-template <typename DeviceContext, template <typename T> typename CompareFunctor,
+template <typename DeviceContext,
+          template <typename InT, typename OutT> typename CompareFunctor,
           typename T>
 struct GetMask {
   void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs,
                   const Tensor& rhs, Tensor* mask) {
-    SameDimsBinaryOP<int64_t, CompareFunctor<int64_t>, T>(lhs, rhs, mask);
+    SameDimsBinaryOP<int64_t, CompareFunctor<int64_t, T>, T>(lhs, rhs, mask);
   }
 };
 
diff --git a/paddle/fluid/platform/bfloat16_test.cc b/paddle/fluid/platform/bfloat16_test.cc
index dc2d3aa73ba601362b071cf5aa23edd97cbc70b0..cac9767dc2948fb1c272f20a5593dc4d3a499cc0 100644
--- a/paddle/fluid/platform/bfloat16_test.cc
+++ b/paddle/fluid/platform/bfloat16_test.cc
@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/eigen_ext.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
 
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "gtest/gtest.h"
diff --git a/paddle/fluid/platform/complex_test.cc b/paddle/fluid/platform/complex_test.cc
index 4d13161e94faf910829fd93543e6c18990ea7813..05171495a88320c36a87d09de1551759ec4bdfce 100644
--- a/paddle/fluid/platform/complex_test.cc
+++ b/paddle/fluid/platform/complex_test.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/fluid/platform/complex.h"
 #include <complex>
-#include "paddle/fluid/platform/eigen_ext.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
 
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "gtest/gtest.h"
@@ -267,56 +267,56 @@ TEST(complex, print) {
 TEST(complex, isinf) {
   // *********** complex<float> *************
   complex<float> a;
-  a.real = float(INFINITY);
+  a.real = static_cast<float>(INFINITY);
   EXPECT_EQ(std::isinf(a), true);
-  a.imag = float(INFINITY);
+  a.imag = static_cast<float>(INFINITY);
   EXPECT_EQ(std::isinf(a), true);
 
-  complex<float> b = float(INFINITY);
+  complex<float> b = static_cast<float>(INFINITY);
   EXPECT_EQ(std::isinf(b), true);
 
-  complex<float> c(float(INFINITY), 0);
+  complex<float> c(static_cast<float>(INFINITY), 0);
   EXPECT_EQ(std::isinf(c), true);
 
   // *********** complex<double> *************
   complex<double> a1;
-  a1.real = double(INFINITY);
+  a1.real = static_cast<double>(INFINITY);
   EXPECT_EQ(std::isinf(a1), true);
-  a1.imag = double(INFINITY);
+  a1.imag = static_cast<double>(INFINITY);
   EXPECT_EQ(std::isinf(a1), true);
 
-  complex<double> b1 = double(INFINITY);
+  complex<double> b1 = static_cast<double>(INFINITY);
   EXPECT_EQ(std::isinf(b1), true);
 
-  complex<double> c1(double(INFINITY), 0);
+  complex<double> c1(static_cast<double>(INFINITY), 0);
   EXPECT_EQ(std::isinf(c1), true);
 }
 
 TEST(complex, isnan) {
   // *********** complex<float> *************
   complex<float> a;
-  a.real = float(NAN);
+  a.real = static_cast<float>(NAN);
   EXPECT_EQ(std::isnan(a), true);
-  a.imag = float(NAN);
+  a.imag = static_cast<float>(NAN);
   EXPECT_EQ(std::isnan(a), true);
 
-  complex<float> b = float(NAN);
+  complex<float> b = static_cast<float>(NAN);
   EXPECT_EQ(std::isnan(b), true);
 
-  complex<float> c(float(NAN), 0);
+  complex<float> c(static_cast<float>(NAN), 0);
   EXPECT_EQ(std::isnan(c), true);
 
   // *********** complex<double> *************
   complex<double> a1;
-  a1.real = double(NAN);
+  a1.real = static_cast<double>(NAN);
   EXPECT_EQ(std::isnan(a1), true);
-  a1.imag = double(NAN);
+  a1.imag = static_cast<double>(NAN);
   EXPECT_EQ(std::isnan(a1), true);
 
-  complex<double> b1 = double(NAN);
+  complex<double> b1 = static_cast<double>(NAN);
   EXPECT_EQ(std::isnan(b1), true);
 
-  complex<double> c1(double(NAN), 0);
+  complex<double> c1(static_cast<double>(NAN), 0);
   EXPECT_EQ(std::isnan(c1), true);
 }
 
diff --git a/paddle/fluid/platform/complex_test.cu b/paddle/fluid/platform/complex_test.cu
index b46d1b7b271d78fd436682fa2a5ffae974e61326..8f1dea1044677b3f1f49a5220b1d7a890e54ecbf 100644
--- a/paddle/fluid/platform/complex_test.cu
+++ b/paddle/fluid/platform/complex_test.cu
@@ -23,8 +23,8 @@
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/eigen_ext.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 namespace paddle {
@@ -303,59 +303,59 @@ TEST(complex, print) {
 TEST(complex, isinf) {
   // *********** complex<float> *************
   complex<float> a;
-  a.real = float(INFINITY);
+  a.real = static_cast<float>(INFINITY);
   EXPECT_EQ(std::isinf(a), true);
-  a.imag = float(INFINITY);
+  a.imag = static_cast<float>(INFINITY);
   EXPECT_EQ(std::isinf(a), true);
 
-  complex<float> b = float(INFINITY);
+  complex<float> b = static_cast<float>(INFINITY);
   EXPECT_EQ(std::isinf(b), true);
 
-  complex<float> c(float(INFINITY), 0);
+  complex<float> c(static_cast<float>(INFINITY), 0);
   EXPECT_EQ(std::isinf(c), true);
 
   // *********** complex<double> *************
   complex<double> a1;
-  a1.real = double(INFINITY);
+  a1.real = static_cast<double>(INFINITY);
   EXPECT_EQ(std::isinf(a1), true);
-  a1.imag = double(INFINITY);
+  a1.imag = static_cast<double>(INFINITY);
   EXPECT_EQ(std::isinf(a1), true);
 
-  complex<double> b1 = double(INFINITY);
+  complex<double> b1 = static_cast<double>(INFINITY);
   EXPECT_EQ(std::isinf(b1), true);
 
-  complex<double> c1(double(INFINITY), 0);
+  complex<double> c1(static_cast<double>(INFINITY), 0);
   EXPECT_EQ(std::isinf(c1), true);
 }
 
 TEST(complex, isnan) {
   // *********** complex<float> *************
   complex<float> a;
-  a.real = float(NAN);
+  a.real = static_cast<float>(NAN);
   EXPECT_EQ(std::isnan(a), true);
-  a.imag = float(NAN);
+  a.imag = static_cast<float>(NAN);
   EXPECT_EQ(std::isnan(a), true);
 
-  complex<float> b = float(NAN);
+  complex<float> b = static_cast<float>(NAN);
   EXPECT_EQ(std::isnan(b), true);
 
-  complex<float> c(float(NAN), 0);
+  complex<float> c(static_cast<float>(NAN), 0);
   EXPECT_EQ(std::isnan(c), true);
 
   // *********** complex<double> *************
   complex<double> a1;
-  a1.real = double(NAN);
+  a1.real = static_cast<double>(NAN);
   EXPECT_EQ(std::isnan(a1), true);
-  a1.imag = double(NAN);
+  a1.imag = static_cast<double>(NAN);
   EXPECT_EQ(std::isnan(a1), true);
 
-  complex<double> b1 = double(NAN);
+  complex<double> b1 = static_cast<double>(NAN);
   EXPECT_EQ(std::isnan(b1), true);
 
-  complex<double> c1(double(NAN), 0);
+  complex<double> c1(static_cast<double>(NAN), 0);
   EXPECT_EQ(std::isnan(c1), true);
 }
 
 }  // namespace platform
 }  // namespace paddle
-#endif
\ No newline at end of file
+#endif
diff --git a/paddle/fluid/platform/device/npu/dynload/hccl.h b/paddle/fluid/platform/device/npu/dynload/hccl.h
index a56180ce2d4ca56b7ffbce9b8c384a1cd72d21d4..2c251ceb5491714ece2dfddf35a3914f7ff89257 100644
--- a/paddle/fluid/platform/device/npu/dynload/hccl.h
+++ b/paddle/fluid/platform/device/npu/dynload/hccl.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 #define HCOM_GROUP_PREFIX "HCOM_GROUP_"
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 27b900198bc82e5629cab29e8d325e31ff69d26d..6ffeaf101feca795f8a330b72206dffa2d68904c 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/ipu/ipu_backend.h"
 #endif
 #include "glog/logging.h"
+#include "paddle/fluid/framework/expect.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
@@ -841,15 +842,6 @@ unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
   return num_entries;
 }
 
-// TODO(jczaja): Replace with C++20 equivalents when applicable
-#ifdef _WIN32
-#define likely(expr) (expr)
-#define unlikely(expr) (expr)
-#else
-#define likely(expr) (__builtin_expect(!!(expr), 1))
-#define unlikely(expr) (__builtin_expect(!!(expr), 0))
-#endif
-
 MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob(
     const std::string& name) const {
   BlobMap* pMap = p_blobmap_.get();
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 7695f8b58810dbec358741ea9b8320fa1b9fbebc..49391a65b185b45b35edac5d6217a2e4095b4c4a 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,4 +1,4 @@
-cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
+cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce pten_dynamic_loader)
 
 list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc  nvtx.cc cufft.cc)
 
@@ -34,24 +34,24 @@ if (CUPTI_FOUND)
     list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 if(WITH_ROCM)
-  hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader)
-  cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
+  hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader pten_dynload_cuda)
+  cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
 elseif (WITH_ASCEND_CL)
-  cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl)
+  cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl pten_dynload_warpctc)
 else()
-  nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
-  cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
+  nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader pten_dynload_cuda)
+  cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
 endif()
 if (WITH_MKLML)
-    cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
+    cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml pten_dynload_mklml)
 endif()
 
-cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader)
+cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader pten_dynload_lapack)
 add_dependencies(dynload_lapack extern_lapack)
 # TODO(TJ): add iomp, mkldnn?
 
 if (MKL_FOUND AND WITH_ONEMKL)
   message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
-  cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader)
+  cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader pten_dynload_mklrt)
   target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE})
 endif()
diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc
index 41648c32fe6f98bb0b78ea7891065e5586f70463..aee0f2c9e39fbd9a646e2e2fb322eb822479d7b5 100644
--- a/paddle/fluid/platform/dynload/cublas.cc
+++ b/paddle/fluid/platform/dynload/cublas.cc
@@ -17,8 +17,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag cublas_dso_flag;
-void *cublas_dso_handle = nullptr;
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 17ae4d5bf03d7b20862b6d384719b25d5fc69e90..eb2c019af9b6461ccd4b7f7499af057baaceb8a1 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -20,16 +20,12 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 #include <type_traits>
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/cublas.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag cublas_dso_flag;
-extern void *cublas_dso_handle;
-
 /**
  * The following macro definition can generate structs
  * (for each function) to dynamic load cublas routine
@@ -37,19 +33,8 @@ extern void *cublas_dso_handle;
  *
  * note: default dynamic linked libs
  */
-#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                 \
-    template <typename... Args>                                              \
-    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {  \
-      using cublas_func =                                                    \
-          decltype(::__name(std::declval<Args>()...)) (*)(Args...);          \
-      std::call_once(cublas_dso_flag, []() {                                 \
-        cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
-      });                                                                    \
-      static void *p_##__name = dlsym(cublas_dso_handle, #__name);           \
-      return reinterpret_cast<cublas_func>(p_##__name)(args...);             \
-    }                                                                        \
-  };                                                                         \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
@@ -99,7 +84,7 @@ extern void *cublas_dso_handle;
   __macro(cublasSgetrsBatched);           \
   __macro(cublasDgetrsBatched);
 
-CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+CUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 
 // APIs available after CUDA 8.0
 #if CUDA_VERSION >= 8000
@@ -111,7 +96,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
   __macro(cublasZgemmStridedBatched);        \
   __macro(cublasHgemmStridedBatched);
 
-CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+CUBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 #endif
 
 // APIs available after CUDA 9.0
@@ -120,7 +105,7 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
   __macro(cublasSetMathMode);                \
   __macro(cublasGetMathMode);
 
-CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+CUBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 #endif
 
 // APIs available after CUDA 9.1
@@ -129,10 +114,10 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
   __macro(cublasGemmBatchedEx);              \
   __macro(cublasGemmStridedBatchedEx);
 
-CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+CUBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 #endif
 
-#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cublasLt.cc b/paddle/fluid/platform/dynload/cublasLt.cc
index 78f952985c8117c6832be0af2c657dc6a9502d41..891df24034b877a55e1c2d07ac30ea4ca7fb32e9 100644
--- a/paddle/fluid/platform/dynload/cublasLt.cc
+++ b/paddle/fluid/platform/dynload/cublasLt.cc
@@ -17,8 +17,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag cublasLt_dso_flag;
-void *cublasLt_dso_handle = nullptr;
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
diff --git a/paddle/fluid/platform/dynload/cublasLt.h b/paddle/fluid/platform/dynload/cublasLt.h
index f4e04c94e04c615dce496ff0c95064b6326880f7..aa605be1d698e6ba8f6928c26ae682b11c871444 100644
--- a/paddle/fluid/platform/dynload/cublasLt.h
+++ b/paddle/fluid/platform/dynload/cublasLt.h
@@ -19,16 +19,12 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 #include <type_traits>
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/cublasLt.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag cublasLt_dso_flag;
-extern void *cublasLt_dso_handle;
-
 /**
  * The following macro definition can generate structs
  * (for each function) to dynamic load cublasLt routine
@@ -36,20 +32,8 @@ extern void *cublasLt_dso_handle;
  *
  * note: default dynamic linked libs
  */
-#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name)                          \
-  struct DynLoad__##__name {                                                \
-    template <typename... Args>                                             \
-    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
-      using cublasLt_func =                                                 \
-          decltype(::__name(std::declval<Args>()...)) (*)(Args...);         \
-      std::call_once(cublasLt_dso_flag, []() {                              \
-        cublasLt_dso_handle =                                               \
-            paddle::platform::dynload::GetCublasLtDsoHandle();              \
-      });                                                                   \
-      static void *p_##__name = dlsym(cublasLt_dso_handle, #__name);        \
-      return reinterpret_cast<cublasLt_func>(p_##__name)(args...);          \
-    }                                                                       \
-  };                                                                        \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name)   \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 // APIs available after CUDA 10.1
@@ -69,10 +53,10 @@ extern void *cublasLt_dso_handle;
   __macro(cublasLtMatrixTransformDescDestroy); \
   __macro(cublasLtMatrixTransformDescSetAttribute);
 
-CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
+CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
 // #endif
 
-#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cuda_driver.cc b/paddle/fluid/platform/dynload/cuda_driver.cc
index 6110e6b6ba93fa108e6b92d18af7bda0ad383840..f4c814979e5c24be598ac220fd7791562ffdc6f7 100644
--- a/paddle/fluid/platform/dynload/cuda_driver.cc
+++ b/paddle/fluid/platform/dynload/cuda_driver.cc
@@ -13,14 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/cuda_driver.h"
+#include "paddle/pten/backends/dynload/cuda_driver.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag cuda_dso_flag;
-void* cuda_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 #if CUDA_VERSION >= 10020
@@ -28,10 +26,7 @@ CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP);
 #endif
 CUDA_ROUTINE_EACH(DEFINE_WRAP);
 
-bool HasCUDADriver() {
-  std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); });
-  return cuda_dso_handle != nullptr;
-}
+bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h
index b5212c64cd14d40b7600c9bae623888d999d4d19..4d48e9f778ed0e0a44703b4944ce30a3b4ee2e31 100644
--- a/paddle/fluid/platform/dynload/cuda_driver.h
+++ b/paddle/fluid/platform/dynload/cuda_driver.h
@@ -17,30 +17,17 @@ limitations under the License. */
 #include <cuda.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/cuda_driver.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag cuda_dso_flag;
-extern void* cuda_dso_handle;
 extern bool HasCUDADriver();
 
-#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name)                           \
-  struct DynLoad__##__name {                                             \
-    template <typename... Args>                                          \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {     \
-      using cuda_func = decltype(&::__name);                             \
-      std::call_once(cuda_dso_flag, []() {                               \
-        cuda_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
-      });                                                                \
-      static void* p_##__name = dlsym(cuda_dso_handle, #__name);         \
-      return reinterpret_cast<cuda_func>(p_##__name)(args...);           \
-    }                                                                    \
-  };                                                                     \
-  extern struct DynLoad__##__name __name
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name)       \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
+  extern DynLoad__##__name __name
 
 /**
  * include all needed cuda driver functions
@@ -72,12 +59,12 @@ extern bool HasCUDADriver();
   __macro(cuMemRelease);                  \
   __macro(cuMemAddressFree)
 
-CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
+CUDA_ROUTINE_EACH_VVM(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
 #endif
 
-CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
+CUDA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
 
-#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index 366762401c741e570204f5c9d146343a3d60aa33..1d89f526dc18ceb03cf9880f1042cbe3126f3f63 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/cudnn.h"
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/backends/dynload/cudnn.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag cudnn_dso_flag;
-void* cudnn_dso_handle = nullptr;
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
@@ -45,19 +43,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
 #endif
 
-bool HasCUDNN() {
-  std::call_once(cudnn_dso_flag,
-                 []() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
-  return cudnn_dso_handle != nullptr;
-}
-
-void EnforceCUDNNLoaded(const char* fn_name) {
-  PADDLE_ENFORCE_NOT_NULL(
-      cudnn_dso_handle,
-      platform::errors::PreconditionNotMet(
-          "Cannot load cudnn shared library. Cannot invoke method %s.",
-          fn_name));
-}
+bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 3420c38fe963956813ce2cd18ba5c63d366d217c..a46c7303cfc53a6cdb8240dbad3a36fe5b4952b0 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -18,32 +18,17 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/cudnn.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag cudnn_dso_flag;
-extern void* cudnn_dso_handle;
 extern bool HasCUDNN();
 
-extern void EnforceCUDNNLoaded(const char* fn_name);
-#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
-  struct DynLoad__##__name {                                               \
-    template <typename... Args>                                            \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
-      using cudnn_func = decltype(&::__name);                              \
-      std::call_once(cudnn_dso_flag, []() {                                \
-        cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
-      });                                                                  \
-      EnforceCUDNNLoaded(#__name);                                         \
-      static void* p_##__name = dlsym(cudnn_dso_handle, #__name);          \
-      return reinterpret_cast<cudnn_func>(p_##__name)(args...);            \
-    }                                                                      \
-  };                                                                       \
-  extern struct DynLoad__##__name __name
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)      \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
+  extern DynLoad__##__name __name
 
 /**
  * include all needed cudnn functions in HPPL
@@ -127,7 +112,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(cudnnGetActivationDescriptor);                   \
   __macro(cudnnDestroyActivationDescriptor);               \
   __macro(cudnnSetRNNDescriptor_v6);
-CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
 #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \
@@ -135,7 +120,8 @@ CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
   __macro(cudnnGetConvolutionForwardAlgorithm);          \
   __macro(cudnnGetConvolutionBackwardDataAlgorithm);     \
   __macro(cudnnSetRNNDescriptor);
-CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(
+    PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
 #if CUDNN_VERSION >= 7001
@@ -153,7 +139,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
   __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
   __macro(cudnnGetConvolutionForwardAlgorithm_v7);        \
   __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
-CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
 #if CUDNN_VERSION >= 7201
@@ -166,7 +152,7 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
   __macro(cudnnRNNBackwardDataEx);                   \
   __macro(cudnnRNNBackwardWeightsEx);                \
   __macro(cudnnRNNForwardInferenceEx);
-CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
 #if CUDNN_VERSION >= 7401
@@ -176,7 +162,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
   __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize);        \
   __macro(cudnnBatchNormalizationBackwardEx);                        \
   __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
-CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
 #if CUDNN_VERSION >= 8000
@@ -192,7 +178,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
   __macro(cudnnSetFusedOpsConstParamPackAttribute);   \
   __macro(cudnnSetFusedOpsVariantParamPackAttribute); \
   __macro(cudnnMakeFusedOpsPlan);
-CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R8(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
 }  // namespace dynload
diff --git a/paddle/fluid/platform/dynload/cufft.cc b/paddle/fluid/platform/dynload/cufft.cc
index a125fb7226050ba810d2740416cefcdd6eb9b0e7..3f3534112e47db51b9ac606c31d70122780cb3eb 100644
--- a/paddle/fluid/platform/dynload/cufft.cc
+++ b/paddle/fluid/platform/dynload/cufft.cc
@@ -13,31 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/cufft.h"
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/backends/dynload/cufft.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag cufft_dso_flag;
-void* cufft_dso_handle = nullptr;
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
 
-bool HasCUFFT() {
-  std::call_once(cufft_dso_flag,
-                 []() { cufft_dso_handle = GetCUFFTDsoHandle(); });
-  return cufft_dso_handle != nullptr;
-}
-
-void EnforceCUFFTLoaded(const char* fn_name) {
-  PADDLE_ENFORCE_NOT_NULL(
-      cufft_dso_handle,
-      platform::errors::PreconditionNotMet(
-          "Cannot load cufft shared library. Cannot invoke method %s.",
-          fn_name));
-}
+bool HasCUFFT() { return pten::dynload::HasCUFFT(); }
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/cufft.h b/paddle/fluid/platform/dynload/cufft.h
index ef924d7b5ee865cce76a3ae872d2cbb36466b5be..4d95edeefc05345384c5dd486f0765eb33fbf367 100644
--- a/paddle/fluid/platform/dynload/cufft.h
+++ b/paddle/fluid/platform/dynload/cufft.h
@@ -19,32 +19,17 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/cufft.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag cufft_dso_flag;
-extern void* cufft_dso_handle;
 extern bool HasCUFFT();
 
-extern void EnforceCUFFTLoaded(const char* fn_name);
-#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name)                            \
-  struct DynLoad__##__name {                                               \
-    template <typename... Args>                                            \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
-      using cufft_func = decltype(&::__name);                              \
-      std::call_once(cufft_dso_flag, []() {                                \
-        cufft_dso_handle = paddle::platform::dynload::GetCUFFTDsoHandle(); \
-      });                                                                  \
-      EnforceCUFFTLoaded(#__name);                                         \
-      static void* p_##__name = dlsym(cufft_dso_handle, #__name);          \
-      return reinterpret_cast<cufft_func>(p_##__name)(args...);            \
-    }                                                                      \
-  };                                                                       \
-  extern struct DynLoad__##__name __name
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name)      \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
+  extern DynLoad__##__name __name
 
 /**
  * include all needed cufft functions in HPPL
@@ -104,7 +89,7 @@ extern void EnforceCUFFTLoaded(const char* fn_name);
   __macro(cufftXtExecDescriptor);        \
   __macro(cufftXtSetWorkAreaPolicy);
 
-CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
+CUFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/cupti.cc b/paddle/fluid/platform/dynload/cupti.cc
index d8381580c90d4ef9cd1bf5adbce7a733f81c91e0..5e2c8630617b532bc4f2d8076f4cbe10bcf550f1 100644
--- a/paddle/fluid/platform/dynload/cupti.cc
+++ b/paddle/fluid/platform/dynload/cupti.cc
@@ -20,9 +20,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag cupti_dso_flag;
-void *cupti_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 CUPTI_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h
index 49bfdce4d38bbdfdcdd3e3ea64684dd4179e6f63..c6d844cee9d02d111835c950cecd69792dca0af4 100644
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -19,16 +19,12 @@ limitations under the License. */
 #include <cupti.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/cupti.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag cupti_dso_flag;
-extern void *cupti_dso_handle;
-
 /**
  * The following macro definition can generate structs
  * (for each function) to dynamic load cupti routine
@@ -36,18 +32,8 @@ extern void *cupti_dso_handle;
  *
  * note: default dynamic linked libs
  */
-#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                            \
-  struct DynLoad__##__name {                                               \
-    template <typename... Args>                                            \
-    inline CUptiResult CUPTIAPI operator()(Args... args) {                 \
-      using cuptiFunc = decltype(&::__name);                               \
-      std::call_once(cupti_dso_flag, []() {                                \
-        cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
-      });                                                                  \
-      static void *p_##__name = dlsym(cupti_dso_handle, #__name);          \
-      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);             \
-    }                                                                      \
-  };                                                                       \
+#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)               \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define CUPTI_ROUTINE_EACH(__macro)           \
diff --git a/paddle/fluid/platform/dynload/curand.cc b/paddle/fluid/platform/dynload/curand.cc
index ce83ebc84fe7bbd91e1c7e46f98a7f0d8b4a7394..9a6686515ea2b2d0cf0ef477f32839bbb3c6ccd5 100644
--- a/paddle/fluid/platform/dynload/curand.cc
+++ b/paddle/fluid/platform/dynload/curand.cc
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag curand_dso_flag;
-void *curand_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
index 7a160664bc2e8973951892c23981fdd746b2123f..89b08bf7097cd94f8386f8cdd2eb72849d36f4c3 100644
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -16,27 +16,14 @@ limitations under the License. */
 #include <curand.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/curand.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag curand_dso_flag;
-extern void *curand_dso_handle;
-
-#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                 \
-    template <typename... Args>                                              \
-    curandStatus_t operator()(Args... args) {                                \
-      using curandFunc = decltype(&::__name);                                \
-      std::call_once(curand_dso_flag, []() {                                 \
-        curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
-      });                                                                    \
-      static void *p_##__name = dlsym(curand_dso_handle, #__name);           \
-      return reinterpret_cast<curandFunc>(p_##__name)(args...);              \
-    }                                                                        \
-  };                                                                         \
+
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define CURAND_RAND_ROUTINE_EACH(__macro)      \
@@ -48,7 +35,7 @@ extern void *curand_dso_handle;
   __macro(curandGenerateNormal);               \
   __macro(curandDestroyGenerator);
 
-CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
+CURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/cusolver.cc b/paddle/fluid/platform/dynload/cusolver.cc
index d4163e9a7431b086cee4e99dd4c07e42d7d8c0b7..bf8394f3f02ca22945ddc94032863dd5845f76f4 100644
--- a/paddle/fluid/platform/dynload/cusolver.cc
+++ b/paddle/fluid/platform/dynload/cusolver.cc
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag cusolver_dso_flag;
-void *cusolver_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 CUSOLVER_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h
index 63661a93cfd85dafe851ceb0820ad4fcccf63d0f..e4b29c6377385889336dda501afbca84bb4aeec8 100644
--- a/paddle/fluid/platform/dynload/cusolver.h
+++ b/paddle/fluid/platform/dynload/cusolver.h
@@ -17,28 +17,14 @@ limitations under the License. */
 #include <cusolverDn.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/cusolver.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag cusolver_dso_flag;
-extern void *cusolver_dso_handle;
 
-#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name)                   \
-  struct DynLoad__##__name {                                         \
-    template <typename... Args>                                      \
-    cusolverStatus_t operator()(Args... args) {                      \
-      using cusolverFunc = decltype(&::__name);                      \
-      std::call_once(cusolver_dso_flag, []() {                       \
-        cusolver_dso_handle =                                        \
-            paddle::platform::dynload::GetCusolverDsoHandle();       \
-      });                                                            \
-      static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \
-      return reinterpret_cast<cusolverFunc>(p_##__name)(args...);    \
-    }                                                                \
-  };                                                                 \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name)   \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define CUSOLVER_ROUTINE_EACH(__macro)  \
@@ -62,7 +48,7 @@ extern void *cusolver_dso_handle;
   __macro(cusolverDnCheevd);            \
   __macro(cusolverDnZheevd);
 
-CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
+CUSOLVER_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
 
 #if CUDA_VERSION >= 9020
 #define CUSOLVER_ROUTINE_EACH_R1(__macro) \
@@ -105,7 +91,7 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
   __macro(cusolverDnCungqr);              \
   __macro(cusolverDnZungqr);
 
-CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
+CUSOLVER_ROUTINE_EACH_R1(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
 #endif
 
 #if CUDA_VERSION >= 9020
@@ -117,10 +103,10 @@ CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
   __macro(cusolverDnDsyevj);              \
   __macro(cusolverDnDestroySyevjInfo);
 
-CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
+CUSOLVER_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
 #endif
 
-#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cusparse.cc b/paddle/fluid/platform/dynload/cusparse.cc
index be67f121d68ed9654db63ef5402c88ef09223af2..ea7c502e3e681c94cea76b7c3176c7de29adfc2b 100644
--- a/paddle/fluid/platform/dynload/cusparse.cc
+++ b/paddle/fluid/platform/dynload/cusparse.cc
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag cusparse_dso_flag;
-void *cusparse_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 #ifdef CUSPARSE_ROUTINE_EACH
diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h
index fc842a3377b636bb7c2b1fff600d0842b2e07108..596d2b51aec3e4527ed4c4fce35a36a75c9c1b85 100644
--- a/paddle/fluid/platform/dynload/cusparse.h
+++ b/paddle/fluid/platform/dynload/cusparse.h
@@ -17,28 +17,14 @@ limitations under the License. */
 #include <cusparse.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/cusparse.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag cusparse_dso_flag;
-extern void *cusparse_dso_handle;
 
-#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name)                   \
-  struct DynLoad__##__name {                                         \
-    template <typename... Args>                                      \
-    cusparseStatus_t operator()(Args... args) {                      \
-      using cusparseFunc = decltype(&::__name);                      \
-      std::call_once(cusparse_dso_flag, []() {                       \
-        cusparse_dso_handle =                                        \
-            paddle::platform::dynload::GetCusparseDsoHandle();       \
-      });                                                            \
-      static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
-      return reinterpret_cast<cusparseFunc>(p_##__name)(args...);    \
-    }                                                                \
-  };                                                                 \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name)   \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #if defined(PADDLE_WITH_CUDA)
@@ -54,7 +40,7 @@ extern void *cusparse_dso_handle;
   __macro(cusparseSetMatType);         \
   __macro(cusparseSetMatIndexBase);
 
-CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
+CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
 
 // APIs available after CUDA 11.2
 #if CUDA_VERSION >= 11020
@@ -74,7 +60,7 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
   __macro(cusparseSparseToDense_bufferSize); \
   __macro(cusparseSparseToDense);
 
-CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
+CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 
 // APIs available after CUDA 11.3
 #if CUDA_VERSION >= 11030
@@ -83,13 +69,13 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
   __macro(cusparseSDDMM_preprocess);      \
   __macro(cusparseSDDMM);
 
-CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
+CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 #endif
 #endif
 #endif
 #endif
 
-#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 905f1aea887ab8ef4d971f7697d366dd8c89b8d7..caefb5a4e2276c19acf1317a7f590e43e8d805fb 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -15,556 +15,61 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
-
 #include "gflags/gflags.h"
-#include "glog/logging.h"
-#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
-#include "paddle/fluid/platform/enforce.h"
-
-#if defined(_WIN32)
-#include <windows.h>
-#endif
-
-DEFINE_string(cudnn_dir, "",
-              "Specify path for loading libcudnn.so. For instance, "
-              "/usr/local/cudnn/lib. If empty [default], dlopen "
-              "will search cudnn from LD_LIBRARY_PATH");
-
-DEFINE_string(
-    cuda_dir, "",
-    "Specify path for loading cuda library, such as libcublas, libcublasLt "
-    "libcurand, libcusolver. For instance, /usr/local/cuda/lib64. "
-    "If default, dlopen will search cuda from LD_LIBRARY_PATH");
-
-DEFINE_string(nccl_dir, "",
-              "Specify path for loading nccl library, such as libnccl.so. "
-              "For instance, /usr/local/cuda/lib64. If default, "
-              "dlopen will search cuda from LD_LIBRARY_PATH");
-
-DEFINE_string(hccl_dir, "",
-              "Specify path for loading hccl library, such as libhccl.so. "
-              "For instance, "
-              "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. If "
-              "default, "
-              "dlopen will search hccl from LD_LIBRARY_PATH");
-
-DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
-
-DEFINE_string(
-    tensorrt_dir, "",
-    "Specify path for loading tensorrt library, such as libnvinfer.so.");
-
-DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
-
-DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
-
-DEFINE_string(mkl_dir, "",
-              "Specify path for loading libmkl_rt.so. "
-              "For insrance, /opt/intel/oneapi/mkl/latest/lib/intel64/."
-              "If default, "
-              "dlopen will search mkl from LD_LIBRARY_PATH");
-
-DEFINE_string(op_dir, "", "Specify path for loading user-defined op library.");
-
-#ifdef PADDLE_WITH_HIP
-
-DEFINE_string(miopen_dir, "",
-              "Specify path for loading libMIOpen.so. For instance, "
-              "/opt/rocm/miopen/lib. If empty [default], dlopen "
-              "will search miopen from LD_LIBRARY_PATH");
-
-DEFINE_string(rocm_dir, "",
-              "Specify path for loading rocm library, such as librocblas, "
-              "libmiopen, libhipsparse. For instance, /opt/rocm/lib. "
-              "If default, dlopen will search rocm from LD_LIBRARY_PATH");
-
-DEFINE_string(rccl_dir, "",
-              "Specify path for loading rccl library, such as librccl.so. "
-              "For instance, /opt/rocm/rccl/lib. If default, "
-              "dlopen will search rccl from LD_LIBRARY_PATH");
-#endif
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-struct PathNode {
-  PathNode() {}
-  std::string path = "";
-};
-
-static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
-
-// NOTE: In order to adapt to the default installation path of cuda
-#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin";
-#else
-static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64";
-#endif
-
-static PathNode s_py_site_pkg_path;
-
-#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll";
-static constexpr char* win_cublas_lib =
-    "cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
-    ".dll;cublas64_" CUDA_VERSION_MAJOR ".dll";
-#if CUDA_VERSION >= 11000
-static constexpr char* win_curand_lib =
-    "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
-    ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll";
-static constexpr char* win_nvjpeg_lib =
-    "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
-    ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll";
-static constexpr char* win_cusolver_lib =
-    "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
-    ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll";
-static constexpr char* win_cusparse_lib =
-    "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
-    ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll";
-static constexpr char* win_cufft_lib =
-    "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
-    ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_10.dll";
-#else
-static constexpr char* win_curand_lib =
-    "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
-    ".dll;curand64_" CUDA_VERSION_MAJOR ".dll";
-static constexpr char* win_nvjpeg_lib =
-    "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
-    ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll";
-static constexpr char* win_cusolver_lib =
-    "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
-    ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll";
-static constexpr char* win_cusparse_lib =
-    "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
-    ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll";
-static constexpr char* win_cufft_lib =
-    "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
-    ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll";
-#endif  // CUDA_VERSION
-#endif
-
-static inline std::string join(const std::string& part1,
-                               const std::string& part2) {
-  // directory separator
-  const char sep = '/';
-  if (!part2.empty() && part2.front() == sep) {
-    return part2;
-  }
-  std::string ret;
-  ret.reserve(part1.size() + part2.size() + 1);
-  ret = part1;
-  if (!ret.empty() && ret.back() != sep) {
-    ret += sep;
-  }
-  ret += part2;
-  return ret;
-}
-
-static inline std::vector<std::string> split(
-    const std::string& str, const std::string separator = " ") {
-  std::vector<std::string> str_list;
-  std::string::size_type firstPos;
-  firstPos = str.find_first_not_of(separator, 0);
-  std::string::size_type lastPos;
-  lastPos = str.find_first_of(separator, firstPos);
-  while (std::string::npos != firstPos && std::string::npos != lastPos) {
-    str_list.push_back(str.substr(firstPos, lastPos - firstPos));
-    firstPos = str.find_first_not_of(separator, lastPos);
-    lastPos = str.find_first_of(separator, firstPos);
-  }
-  if (std::string::npos == lastPos) {
-    str_list.push_back(str.substr(firstPos, lastPos - firstPos));
-  }
-  return str_list;
-}
-
 void SetPaddleLibPath(const std::string& py_site_pkg_path) {
-  s_py_site_pkg_path.path = py_site_pkg_path;
-  VLOG(3) << "Set paddle lib path : " << py_site_pkg_path;
-}
-
-static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
-                                                 const std::string& dso_name,
-                                                 int dynload_flags) {
-  void* dso_handle = nullptr;
-  if (!spec_path.empty()) {
-    // search xxx.so from custom path
-    VLOG(3) << "Try to find library: " << dso_name
-            << " from specific path: " << spec_path;
-    std::string dso_path = join(spec_path, dso_name);
-    dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-  }
-  return dso_handle;
-}
-
-static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
-                                                int dynload_flags) {
-  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
-  // and /usr/local/lib path
-  void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-  VLOG(3) << "Try to find library: " << dso_path
-          << " from default system path.";
-
-// TODO(chenweihang): This path is used to search which libs?
-// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
-// bring System Integrity Projection (SIP), if dso_handle
-// is null, search from default package path in Mac OS.
-#if defined(__APPLE__) || defined(__OSX__)
-  if (nullptr == dso_handle) {
-    dso_handle =
-        dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
-  }
-#endif
-
-  return dso_handle;
+  pten::dynload::SetPaddleLibPath(py_site_pkg_path);
 }
 
-/*
- * We define three priorities for dynamic library search:
- *
- * First: Search for the path specified by the user
- * Second: Search the system default path
- * Third: Search for a special path corresponding to
- *        a specific library to adapt to changes and easy to expand.
- */
-
-static inline void* GetDsoHandleFromSearchPath(
-    const std::string& config_path, const std::string& dso_name,
-    bool throw_on_error = true,
-    const std::vector<std::string>& extra_paths = std::vector<std::string>(),
-    const std::string& warning_msg = std::string()) {
-#if !defined(_WIN32)
-  int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
-#else
-  int dynload_flags = 0;
-#endif  // !_WIN32
-  std::vector<std::string> dso_names = split(dso_name, ";");
-  void* dso_handle = nullptr;
-  for (auto dso : dso_names) {
-    // 1. search in user config path by FLAGS
-    dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags);
-    // 2. search in system default path
-    if (nullptr == dso_handle) {
-      dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags);
-    }
-    // 3. search in extra paths
-    if (nullptr == dso_handle) {
-      for (auto path : extra_paths) {
-        VLOG(3) << "extra_paths: " << path;
-        dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags);
-      }
-    }
-    if (nullptr != dso_handle) break;
-  }
+void* GetCublasDsoHandle() { return pten::dynload::GetCublasDsoHandle(); }
 
-  // 4. [If Failed for All dso_names] logging warning if exists
-  if (nullptr == dso_handle && !warning_msg.empty()) {
-    LOG(WARNING) << warning_msg;
-  }
+void* GetCublasLtDsoHandle() { return pten::dynload::GetCublasLtDsoHandle(); }
 
-  // 5. [If Failed for All dso_names] logging or throw error info
-  if (nullptr == dso_handle) {
-    auto error_msg =
-        "The third-party dynamic library (%s) that Paddle depends on is not "
-        "configured correctly. (error code is %s)\n"
-        "  Suggestions:\n"
-        "  1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) "
-        "is installed correctly and its version is matched with paddlepaddle "
-        "you installed.\n"
-        "  2. Configure third-party dynamic library environment variables as "
-        "follows:\n"
-        "  - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
-        "  - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
-        "  - Mac: set  DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` "
-        "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
-        "impossible unless System Integrity Protection (SIP) is disabled.]";
-#if !defined(_WIN32)
-    auto errorno = dlerror();
-#else
-    auto errorno = GetLastError();
-#endif  // !_WIN32
-    if (throw_on_error) {
-      // NOTE: Special error report case, no need to change its format
-      PADDLE_THROW(
-          platform::errors::PreconditionNotMet(error_msg, dso_name, errorno));
-    } else {
-      LOG(WARNING) << string::Sprintf(error_msg, dso_name, errorno);
-    }
-  }
+void* GetCUDNNDsoHandle() { return pten::dynload::GetCUDNNDsoHandle(); }
 
-  return dso_handle;
-}
+void* GetCUPTIDsoHandle() { return pten::dynload::GetCUPTIDsoHandle(); }
 
-void* GetCublasDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib, true,
-                                    {cuda_lib_path});
-#elif defined(PADDLE_WITH_HIP)
-  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
-#endif
-}
-
-void* GetCublasLtDsoHandle() {
-// APIs available after CUDA 10.1
-#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
-#else
-  std::string warning_msg(
-      "Your CUDA_VERSION less 10.1, not support CublasLt. "
-      "If you want to use CublasLt, please upgrade CUDA and rebuild "
-      "PaddlePaddle.");
-  return nullptr;
-#endif
-}
-
-void* GetCUDNNDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  std::string mac_warn_meg(
-      "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
-      "For instance, sudo tar -xzf "
-      "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
-      "chmod a+r /usr/local/cuda/include/cudnn.h "
-      "/usr/local/cuda/lib/libcudnn*");
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false,
-                                    {}, mac_warn_meg);
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  std::string win_warn_meg(
-      "Note: [Recommend] copy cudnn into CUDA installation directory. \n "
-      "For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from "
-      "NVIDIA's official website, \n"
-      "then, unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing "
-      "Toolkit\\CUDA\\v10.0\n"
-      "You should do this according to your CUDA installation directory and "
-      "CUDNN version.");
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib, true,
-                                    {cuda_lib_path}, win_warn_meg);
-#elif defined(PADDLE_WITH_HIP)
-  return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false,
-                                    {cuda_lib_path});
-#endif
-}
-
-void* GetCUPTIDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.dylib", false,
-                                    {cupti_lib_path});
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.so", false,
-                                    {cupti_lib_path});
-#endif
-}
-
-void* GetCurandDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib, true,
-                                    {cuda_lib_path});
-#elif defined(PADDLE_WITH_HIP)
-  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
-#endif
-}
+void* GetCurandDsoHandle() { return pten::dynload::GetCurandDsoHandle(); }
 
 #ifdef PADDLE_WITH_HIP
-void* GetROCFFTDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so");
-#endif
-}
+void* GetROCFFTDsoHandle() { return pten::dynload::GetROCFFTDsoHandle(); }
 #endif
 
-void* GetNvjpegDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_nvjpeg_lib, true,
-                                    {cuda_lib_path});
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so");
-#endif
-}
+void* GetNvjpegDsoHandle() { return pten::dynload::GetNvjpegDsoHandle(); }
 
-void* GetCusolverDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cusolver_lib, true,
-                                    {cuda_lib_path});
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
-#endif
-}
+void* GetCusolverDsoHandle() { return pten::dynload::GetCusolverDsoHandle(); }
 
-void* GetCusparseDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cusparse_lib, true,
-                                    {cuda_lib_path});
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so");
-#endif
-}
+void* GetCusparseDsoHandle() { return pten::dynload::GetCusparseDsoHandle(); }
 
-void* GetNVRTCDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
-#elif defined(PADDLE_WITH_HIP)
-  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
-#endif
-}
+void* GetNVRTCDsoHandle() { return pten::dynload::GetNVRTCDsoHandle(); }
 
-void* GetCUDADsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
-#elif defined(PADDLE_WITH_HIP)
-  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
-#elif defined(_WIN32)
-  char system32_dir[MAX_PATH];
-  GetSystemDirectory(system32_dir, MAX_PATH);
-  return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
-#endif
-}
+void* GetCUDADsoHandle() { return pten::dynload::GetCUDADsoHandle(); }
 
-void* GetWarpCTCDsoHandle() {
-  std::string warpctc_dir = "";
-  if (!s_py_site_pkg_path.path.empty()) {
-    warpctc_dir = s_py_site_pkg_path.path;
-  }
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll");
-#else
-  return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so");
-#endif
-}
+void* GetWarpCTCDsoHandle() { return pten::dynload::GetWarpCTCDsoHandle(); }
 
-void* GetNCCLDsoHandle() {
-#ifdef PADDLE_WITH_HIP
-  std::string warning_msg(
-      "You may need to install 'rccl' from ROCM official website: "
-      "https://rocmdocs.amd.com/en/latest/Installation_Guide/"
-      "Installation-Guide.html before install PaddlePaddle.");
-#else
-  std::string warning_msg(
-      "You may need to install 'nccl2' from NVIDIA official website: "
-      "https://developer.nvidia.com/nccl/nccl-download"
-      "before install PaddlePaddle.");
-#endif
+void* GetNCCLDsoHandle() { return pten::dynload::GetNCCLDsoHandle(); }
+void* GetHCCLDsoHandle() { return pten::dynload::GetHCCLDsoHandle(); }
 
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
-                                    warning_msg);
-#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
-  return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true, {},
-                                    warning_msg);
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
-                                    warning_msg);
-#endif
-}
-void* GetHCCLDsoHandle() {
-  std::string warning_msg(
-      "You may need to install 'hccl2' from Huawei official website: "
-      "before install PaddlePaddle.");
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
-                                    warning_msg);
-#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
-  return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
+void* GetTensorRtDsoHandle() { return pten::dynload::GetTensorRtDsoHandle(); }
 
-#elif defined(PADDLE_WITH_ASCEND_CL)
-  return GetDsoHandleFromSearchPath(FLAGS_hccl_dir, "libhccl.so", true, {},
-                                    warning_msg);
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
-                                    warning_msg);
-#endif
-}
+void* GetMKLMLDsoHandle() { return pten::dynload::GetMKLMLDsoHandle(); }
 
-void* GetTensorRtDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
-#endif
-}
-
-void* GetMKLMLDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
-#endif
-}
-
-void* GetLAPACKDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3");
-#endif
-}
+void* GetLAPACKDsoHandle() { return pten::dynload::GetLAPACKDsoHandle(); }
 
 void* GetOpDsoHandle(const std::string& dso_name) {
-  return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
+  return pten::dynload::GetOpDsoHandle(dso_name);
 }
 
-void* GetNvtxDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Apple."));
-#elif defined(_WIN32)
-  PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Windows."));
-#elif !defined(PADDLE_WITH_CUDA)
-  PADDLE_THROW(
-      platform::errors::Unimplemented("Nvtx do not support without CUDA."));
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
-#endif
-}
+void* GetNvtxDsoHandle() { return pten::dynload::GetNvtxDsoHandle(); }
 
-void* GetCUFFTDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cufft_lib, true,
-                                    {cuda_lib_path});
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so");
-#endif
-}
+void* GetCUFFTDsoHandle() { return pten::dynload::GetCUFFTDsoHandle(); }
 
-void* GetMKLRTDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so");
-#endif
-}
+void* GetMKLRTDsoHandle() { return pten::dynload::GetMKLRTDsoHandle(); }
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/hipfft.cc b/paddle/fluid/platform/dynload/hipfft.cc
index 767d2161be9d8dea9ad98025b7f2e605e6a89395..0da4758e6d5575462e314c5ff2118842661c0d8c 100644
--- a/paddle/fluid/platform/dynload/hipfft.cc
+++ b/paddle/fluid/platform/dynload/hipfft.cc
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag hipfft_dso_flag;
-void *hipfft_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/hipfft.h b/paddle/fluid/platform/dynload/hipfft.h
index 50c25935e41b7ed4d5b633a5f22120efb1d2efa2..356b6c48a64ee49ca61a60fa053c42ec29b005a7 100644
--- a/paddle/fluid/platform/dynload/hipfft.h
+++ b/paddle/fluid/platform/dynload/hipfft.h
@@ -17,8 +17,7 @@ limitations under the License. */
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/hipfft.h"
 
 namespace paddle {
 namespace platform {
@@ -26,18 +25,6 @@ namespace dynload {
-extern std::once_flag hipfft_dso_flag;
-extern void *hipfft_dso_handle;
 
-#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                 \
-    template <typename... Args>                                              \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {         \
-      using hipfftFunc = decltype(&::__name);                                \
-      std::call_once(hipfft_dso_flag, []() {                                 \
-        hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \
-      });                                                                    \
-      static void *p_##__name = dlsym(hipfft_dso_handle, #__name);           \
-      return reinterpret_cast<hipfftFunc>(p_##__name)(args...);              \
-    }                                                                        \
-  };                                                                         \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define HIPFFT_FFT_ROUTINE_EACH(__macro) \
@@ -70,53 +57,8 @@ extern void *hipfft_dso_handle;
   __macro(hipfftGetVersion);             \
   __macro(hipfftGetProperty);
 
-HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
+HIPFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
 
-inline const char *hipfftGetErrorString(hipfftResult_t status) {
-  switch (status) {
-    case HIPFFT_SUCCESS:
-      return "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
-    case HIPFFT_INVALID_PLAN:
-      return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
-    case HIPFFT_ALLOC_FAILED:
-      return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
-             "memory.";
-    case HIPFFT_INVALID_TYPE:
-      return "'HIPFFT_INVALID_TYPE'. No longer used.";
-    case HIPFFT_INVALID_VALUE:
-      return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
-             "parameter.";
-    case HIPFFT_INTERNAL_ERROR:
-      return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
-             "error.";
-    case HIPFFT_EXEC_FAILED:
-      return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
-    case HIPFFT_SETUP_FAILED:
-      return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
-    case HIPFFT_INVALID_SIZE:
-      return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
-    case HIPFFT_UNALIGNED_DATA:
-      return "'HIPFFT_UNALIGNED_DATA'. No longer used.";
-    case HIPFFT_INCOMPLETE_PARAMETER_LIST:
-      return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
-    case HIPFFT_INVALID_DEVICE:
-      return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
-             "GPU than plan creation.";
-    case HIPFFT_PARSE_ERROR:
-      return "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
-    case HIPFFT_NO_WORKSPACE:
-      return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
-             "plan execution.";
-    case HIPFFT_NOT_IMPLEMENTED:
-      return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
-             "functionality for parameters given.";
-    case HIPFFT_NOT_SUPPORTED:
-      return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
-             "parameters given.";
-    default:
-      return "HIPFFT_STATUS_UNKNOWN_ERROR";
-  }
-}
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/hiprand.cc b/paddle/fluid/platform/dynload/hiprand.cc
index 4fb26d0f9c85a1f10410ac7e3337b6bb5efec116..4ad4eb8e41aba4ab0a78875a2a85333638049382 100644
--- a/paddle/fluid/platform/dynload/hiprand.cc
+++ b/paddle/fluid/platform/dynload/hiprand.cc
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag hiprand_dso_flag;
-void *hiprand_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/hiprand.h b/paddle/fluid/platform/dynload/hiprand.h
index 496e70bb26db683104919a968636f713d7d69f07..4d175b00c70536fc32ce5293c8fa353cbf55e787 100644
--- a/paddle/fluid/platform/dynload/hiprand.h
+++ b/paddle/fluid/platform/dynload/hiprand.h
@@ -16,28 +16,15 @@ limitations under the License. */
 #include <hiprand.h>
 
 #include <mutex>  // NOLINT
-#include "paddle/fluid/platform/port.h"
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/hiprand.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag hiprand_dso_flag;
-extern void *hiprand_dso_handle;
-
-#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                              \
-  struct DynLoad__##__name {                                                  \
-    template <typename... Args>                                               \
-    hiprandStatus_t operator()(Args... args) {                                \
-      using hiprandFunc = decltype(&::__name);                                \
-      std::call_once(hiprand_dso_flag, []() {                                 \
-        hiprand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
-      });                                                                     \
-      static void *p_##__name = dlsym(hiprand_dso_handle, #__name);           \
-      return reinterpret_cast<hiprandFunc>(p_##__name)(args...);              \
-    }                                                                         \
-  };                                                                          \
+
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define HIPRAND_RAND_ROUTINE_EACH(__macro)      \
@@ -49,7 +36,7 @@ extern void *hiprand_dso_handle;
   __macro(hiprandGenerateNormal);               \
   __macro(hiprandDestroyGenerator);
 
-HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
+HIPRAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/hiprtc.cc b/paddle/fluid/platform/dynload/hiprtc.cc
index 86a39d08eaa520dd89108495e0d60d80d4a9471c..ce4f915a9a47997e270fc646038ef36359e01f9b 100644
--- a/paddle/fluid/platform/dynload/hiprtc.cc
+++ b/paddle/fluid/platform/dynload/hiprtc.cc
@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/hiprtc.h"
+#include "paddle/pten/backends/dynload/hiprtc.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag hiprtc_dso_flag;
-void* hiprtc_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 HIPRTC_ROUTINE_EACH(DEFINE_WRAP);
 
-bool HasNVRTC() {
-  std::call_once(hiprtc_dso_flag,
-                 []() { hiprtc_dso_handle = GetNVRTCDsoHandle(); });
-  return hiprtc_dso_handle != nullptr;
-}
+bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h
index 4b376f1858f9436a609904089367677e38a4b403..f2bb56ace69eebce8d8ff7c35d009abdf6c1b75e 100644
--- a/paddle/fluid/platform/dynload/hiprtc.h
+++ b/paddle/fluid/platform/dynload/hiprtc.h
@@ -16,30 +16,17 @@ limitations under the License. */
 
 #include <hip/hiprtc.h>
 #include <mutex>  // NOLINT
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/hiprtc.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag hiprtc_dso_flag;
-extern void* hiprtc_dso_handle;
 extern bool HasNVRTC();
 
-#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name)                            \
-  struct DynLoad__##__name {                                                \
-    template <typename... Args>                                             \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {        \
-      using hiprtc_func = decltype(&::__name);                              \
-      std::call_once(hiprtc_dso_flag, []() {                                \
-        hiprtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
-      });                                                                   \
-      static void* p_##__name = dlsym(hiprtc_dso_handle, #__name);          \
-      return reinterpret_cast<hiprtc_func>(p_##__name)(args...);            \
-    }                                                                       \
-  };                                                                        \
-  extern struct DynLoad__##__name __name
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
+  extern DynLoad__##__name __name
 
 /**
  * include all needed hiprtc functions
@@ -55,9 +42,9 @@ extern bool HasNVRTC();
   __macro(hiprtcGetProgramLog);      \
   __macro(hiprtcGetProgramLogSize)
 
-HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
+HIPRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
 
-#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/lapack.cc b/paddle/fluid/platform/dynload/lapack.cc
index eeebe240874f2897e7d56e684048451862581a78..5a21bb4d041d9b02897b81fd9af8fb58983a7838 100644
--- a/paddle/fluid/platform/dynload/lapack.cc
+++ b/paddle/fluid/platform/dynload/lapack.cc
@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/lapack.h"
-#include <mutex>
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag lapack_dso_flag;
-void* lapack_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 LAPACK_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/lapack.h b/paddle/fluid/platform/dynload/lapack.h
index ce24b98defbe99e519ac7ccf927758c0ef05fdd7..4a55237e3ac2b96886d9c710ed6e096a3c4194cc 100644
--- a/paddle/fluid/platform/dynload/lapack.h
+++ b/paddle/fluid/platform/dynload/lapack.h
@@ -16,122 +16,20 @@ limitations under the License. */
 
 #include <complex>
 #include <mutex>
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-
-// Note(zhouwei): because lapack doesn't provide appropriate header file.
-// should expose API statement yourself.
-
-// getrf_(For example)
-extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv,
-                        int *info);
-extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv,
-                        int *info);
-
-// evd
-extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex<double> *a,
-                        int *lda, double *w, std::complex<double> *work,
-                        int *lwork, double *rwork, int *lrwork, int *iwork,
-                        int *liwork, int *info);
-extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex<float> *a,
-                        int *lda, float *w, std::complex<float> *work,
-                        int *lwork, float *rwork, int *lrwork, int *iwork,
-                        int *liwork, int *info);
-extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda,
-                        double *w, double *work, int *lwork, int *iwork,
-                        int *liwork, int *info);
-extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda,
-                        float *w, float *work, int *lwork, int *iwork,
-                        int *liwork, int *info);
-
-// geev
-extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda,
-                       double *wr, double *wi, double *vl, int *ldvl,
-                       double *vr, int *ldvr, double *work, int *lwork,
-                       int *info);
-extern "C" void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda,
-                       float *wr, float *wi, float *vl, int *ldvl, float *vr,
-                       int *ldvr, float *work, int *lwork, int *info);
-extern "C" void zgeev_(char *jobvl, char *jobvr, int *n,
-                       std::complex<double> *a, int *lda,
-                       std::complex<double> *w, std::complex<double> *vl,
-                       int *ldvl, std::complex<double> *vr, int *ldvr,
-                       std::complex<double> *work, int *lwork, double *rwork,
-                       int *info);
-extern "C" void cgeev_(char *jobvl, char *jobvr, int *n, std::complex<float> *a,
-                       int *lda, std::complex<float> *w,
-                       std::complex<float> *vl, int *ldvl,
-                       std::complex<float> *vr, int *ldvr,
-                       std::complex<float> *work, int *lwork, float *rwork,
-                       int *info);
-
-// gels
-extern "C" void dgels_(char *trans, int *m, int *n, int *nrhs, double *a,
-                       int *lda, double *b, int *ldb, double *work, int *lwork,
-                       int *info);
-extern "C" void sgels_(char *trans, int *m, int *n, int *nrhs, float *a,
-                       int *lda, float *b, int *ldb, float *work, int *lwork,
-                       int *info);
-
-// gelsd
-extern "C" void dgelsd_(int *m, int *n, int *nrhs, double *a, int *lda,
-                        double *b, int *ldb, double *s, double *rcond,
-                        int *rank, double *work, int *lwork, int *iwork,
-                        int *info);
-extern "C" void sgelsd_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
-                        int *ldb, float *s, float *rcond, int *rank,
-                        float *work, int *lwork, int *iwork, int *info);
-
-// gelsy
-extern "C" void dgelsy_(int *m, int *n, int *nrhs, double *a, int *lda,
-                        double *b, int *ldb, int *jpvt, double *rcond,
-                        int *rank, double *work, int *lwork, int *info);
-extern "C" void sgelsy_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
-                        int *ldb, int *jpvt, float *rcond, int *rank,
-                        float *work, int *lwork, int *info);
-
-// gelss
-extern "C" void dgelss_(int *m, int *n, int *nrhs, double *a, int *lda,
-                        double *b, int *ldb, double *s, double *rcond,
-                        int *rank, double *work, int *lwork, int *info);
-extern "C" void sgelss_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
-                        int *ldb, float *s, float *rcond, int *rank,
-                        float *work, int *lwork, int *info);
-
-extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex<double> *a,
-                        int *lda, std::complex<double> *b, int *ldb, int *info);
-extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex<float> *a,
-                        int *lda, std::complex<float> *b, int *ldb, int *info);
-extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda,
-                        double *b, int *ldb, int *info);
-extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda,
-                        float *b, int *ldb, int *info);
+#include "paddle/pten/backends/dynload/lapack.h"
+#include "paddle/pten/common/complex.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag lapack_dso_flag;
-extern void *lapack_dso_handle;
-
 /**
  * The following macro definition can generate structs
  * (for each function) to dynamic load lapack routine
  * via operator overloading.
  */
-#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                     \
-  struct DynLoad__##__name {                                                 \
-    template <typename... Args>                                              \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {         \
-      using lapackFunc = decltype(&::__name);                                \
-      std::call_once(lapack_dso_flag, []() {                                 \
-        lapack_dso_handle = paddle::platform::dynload::GetLAPACKDsoHandle(); \
-      });                                                                    \
-      static void *p_##_name = dlsym(lapack_dso_handle, #__name);            \
-      return reinterpret_cast<lapackFunc>(p_##_name)(args...);               \
-    }                                                                        \
-  };                                                                         \
+#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                      \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \
diff --git a/paddle/fluid/platform/dynload/miopen.cc b/paddle/fluid/platform/dynload/miopen.cc
index 1b4bdd2939feb9ad07cf998485d9ff385c1b7a8a..5390bdc6c56bb182d4e8ac02bb7efe4b541db34e 100644
--- a/paddle/fluid/platform/dynload/miopen.cc
+++ b/paddle/fluid/platform/dynload/miopen.cc
@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/miopen.h"
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/backends/dynload/miopen.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag miopen_dso_flag;
-void* miopen_dso_handle = nullptr;
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
@@ -50,19 +48,7 @@ MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
 #endif
 
-bool HasCUDNN() {
-  std::call_once(miopen_dso_flag,
-                 []() { miopen_dso_handle = GetCUDNNDsoHandle(); });
-  return miopen_dso_handle != nullptr;
-}
-
-void EnforceCUDNNLoaded(const char* fn_name) {
-  PADDLE_ENFORCE_NOT_NULL(
-      miopen_dso_handle,
-      platform::errors::PreconditionNotMet(
-          "Cannot load miopen shared library. Cannot invoke method %s.",
-          fn_name));
-}
+bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h
index 34845f24ff50dd6484962103894009c2b58c2eed..6f5532727103ec1cdb7fcf6dafc426942b2f61ed 100644
--- a/paddle/fluid/platform/dynload/miopen.h
+++ b/paddle/fluid/platform/dynload/miopen.h
@@ -18,66 +18,17 @@ limitations under the License. */
 #include <miopen/miopen.h>
 #include <miopen/version.h>
 #include <mutex>  // NOLINT
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-
-#define MIOPEN_VERSION                                       \
-  (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \
-   MIOPEN_VERSION_PATCH)  // NOLINT
-
-// MIOPEN only support NCHW, just for compatibility with CUDNN API
-typedef enum {
-  MIOPEN_TENSOR_NCHW = 0,
-  MIOPEN_TENSOR_NHWC = 1,
-} miopenTensorFormat_t;
+#include "paddle/pten/backends/dynload/miopen.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag miopen_dso_flag;
-extern void* miopen_dso_handle;
 extern bool HasCUDNN();
 
-inline const char* miopenGetErrorString(miopenStatus_t status) {
-  switch (status) {
-    case miopenStatusSuccess:
-      return "MIOPEN_STATUS_SUCCESS";
-    case miopenStatusNotInitialized:
-      return "MIOPEN_STATUS_NOT_INITIALIZED";
-    case miopenStatusInvalidValue:
-      return "MIOPEN_STATUS_INVALID_VALUE";
-    case miopenStatusBadParm:
-      return "MIOPEN_STATUS_BAD_PARAM";
-    case miopenStatusAllocFailed:
-      return "MIOPEN_STATUS_ALLOC_FAILED";
-    case miopenStatusInternalError:
-      return "MIOPEN_STATUS_INTERNAL_ERROR";
-    case miopenStatusNotImplemented:
-      return "MIOPEN_STATUS_NOT_IMPLEMENTED";
-    case miopenStatusUnsupportedOp:
-      return "MIOPEN_STATUS_UNSUPPORTED_OP";
-    case miopenStatusUnknownError:
-    default:
-      return "MIOPEN_STATUS_UNKNOWN_ERROR";
-  }
-}
-
-extern void EnforceCUDNNLoaded(const char* fn_name);
-#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name)                            \
-  struct DynLoad__##__name {                                                \
-    template <typename... Args>                                             \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {        \
-      using miopen_func = decltype(&::__name);                              \
-      std::call_once(miopen_dso_flag, []() {                                \
-        miopen_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
-      });                                                                   \
-      EnforceCUDNNLoaded(#__name);                                          \
-      static void* p_##__name = dlsym(miopen_dso_handle, #__name);          \
-      return reinterpret_cast<miopen_func>(p_##__name)(args...);            \
-    }                                                                       \
-  };                                                                        \
-  extern struct DynLoad__##__name __name
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
+  extern DynLoad__##__name __name
 
 /**
  * include all needed miopen functions in HPPL
@@ -145,23 +96,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(miopenRNNForwardInference);                     \
   __macro(miopenGetTensorNumBytes);
 
-MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 #define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \
   __macro(miopenConvolutionBackwardData);
-MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 // APIs available after R3:
 #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
   __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize);
-MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 // APIs available after R4:
 #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro)    \
   __macro(miopenBatchNormalizationForwardTraining);  \
   __macro(miopenBatchNormalizationForwardInference); \
   __macro(miopenBatchNormalizationBackward);
-MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 // APIs in R5
 #define MIOPEN_DNN_ROUTINE_EACH_R5(__macro)  \
@@ -169,12 +120,12 @@ MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
   __macro(miopenSetActivationDescriptor);    \
   __macro(miopenGetActivationDescriptor);    \
   __macro(miopenDestroyActivationDescriptor);
-MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_R5(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 // APIs in R6
 #define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \
 /*__macro(miopenSetRNNDescriptor_v6);*/
-MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_R6(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 #define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \
   __macro(miopenSetConvolutionGroupCount);  \
@@ -184,7 +135,7 @@ MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
   __macro(miopenSetCTCLossDescriptor);      \
   __macro(miopenGetCTCLossWorkspaceSize);   \
   __macro(miopenCTCLoss);
-MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 
 #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro)                    \
 /*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
@@ -192,7 +143,7 @@ __macro(cudnnBatchNormalizationForwardTrainingEx);                   \
 __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize);          \
 __macro(cudnnBatchNormalizationBackwardEx);                          \
 __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/
-MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/mklml.cc b/paddle/fluid/platform/dynload/mklml.cc
index 020c02d9baadabc061c52e8d33b3bc8ebb74248f..ff475b2312c031bf8a05f42aead22584c6cd61f5 100644
--- a/paddle/fluid/platform/dynload/mklml.cc
+++ b/paddle/fluid/platform/dynload/mklml.cc
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag mklml_dso_flag;
-void* mklml_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 MKLML_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index 335b919f41c34b08fb7ea4398f2db96620058e4f..bd7d40eca3f2f2e6bba1c6741e92359e114f681f 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -17,36 +17,23 @@ limitations under the License. */
 #include <mkl.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/mklml.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag mklml_dso_flag;
-extern void *mklml_dso_handle;
-
 /**
  * The following macro definition can generate structs
  * (for each function) to dynamic load mklml routine
  * via operator overloading.
  */
-#define DYNAMIC_LOAD_MKLML_WRAP(__name)                                    \
-  struct DynLoad__##__name {                                               \
-    template <typename... Args>                                            \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
-      using mklmlFunc = decltype(&::__name);                               \
-      std::call_once(mklml_dso_flag, []() {                                \
-        mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
-      });                                                                  \
-      static void *p_##_name = dlsym(mklml_dso_handle, #__name);           \
-      return reinterpret_cast<mklmlFunc>(p_##_name)(args...);              \
-    }                                                                      \
-  };                                                                       \
+#define DYNAMIC_LOAD_MKLML_WRAP(__name)                       \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
-#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) \
+  DYNAMIC_LOAD_MKLML_WRAP(__name)
 
 #define MKLML_ROUTINE_EACH(__macro) \
   __macro(cblas_sgemm);             \
@@ -111,7 +98,7 @@ extern void *mklml_dso_handle;
   __macro(MKL_Set_Num_Threads);     \
   __macro(MKL_Get_Max_Threads);
 
-MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
+MKLML_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
 
 #if !defined(_WIN32)
 DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
diff --git a/paddle/fluid/platform/dynload/mklrt.h b/paddle/fluid/platform/dynload/mklrt.h
index 423cd4d0a254c8ed795ed0cdbacfa6080d3fde55..c01d52b978094a0f0b32f054735f14f98b3d9e1b 100644
--- a/paddle/fluid/platform/dynload/mklrt.h
+++ b/paddle/fluid/platform/dynload/mklrt.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 namespace paddle {
 namespace platform {
@@ -32,18 +32,8 @@ extern void* mklrt_dso_handle;
  * (for each function) to dynamic load mkldfti routine
  * via operator overloading.
  */
-#define DYNAMIC_LOAD_MKLRT_WRAP(__name)                                    \
-  struct DynLoad__##__name {                                               \
-    template <typename... Args>                                            \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
-      using mklrtFunc = decltype(&::__name);                               \
-      std::call_once(mklrt_dso_flag, []() {                                \
-        mklrt_dso_handle = paddle::platform::dynload::GetMKLRTDsoHandle(); \
-      });                                                                  \
-      static void* p_##__name = dlsym(mklrt_dso_handle, #__name);          \
-      return reinterpret_cast<mklrtFunc>(p_##__name)(args...);             \
-    }                                                                      \
-  };                                                                       \
+#define DYNAMIC_LOAD_MKLRT_WRAP(__name)                       \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 // mkl_dfti.h has a macro that shadows the function with the same name
diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc
index 8f917e4904ffe97a79537c7fca3fbe4e73ca5f66..7b0ea3bb7f3c1f2af26aacb70a72d413e80c2862 100644
--- a/paddle/fluid/platform/dynload/nccl.cc
+++ b/paddle/fluid/platform/dynload/nccl.cc
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag nccl_dso_flag;
-void *nccl_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
index f0679b2bce11ee52526350b6653b39df511e785e..318948a1b29fa49f92e9b97c594c209b04d131df 100644
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -16,28 +16,14 @@ limitations under the License. */
 #include <nccl.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/nccl.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag nccl_dso_flag;
-extern void* nccl_dso_handle;
-
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                           \
-  struct DynLoad__##__name {                                             \
-    template <typename... Args>                                          \
-    auto operator()(Args... args) -> decltype(__name(args...)) {         \
-      using nccl_func = decltype(&::__name);                             \
-      std::call_once(nccl_dso_flag, []() {                               \
-        nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
-      });                                                                \
-      static void* p_##__name = dlsym(nccl_dso_handle, #__name);         \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);           \
-    }                                                                    \
-  };                                                                     \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)       \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define NCCL_RAND_ROUTINE_EACH(__macro) \
@@ -57,30 +43,30 @@ extern void* nccl_dso_handle;
   __macro(ncclReduceScatter);           \
   __macro(ncclGetErrorString);
 
-NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+NCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
 
 #if NCCL_VERSION_CODE >= 2212
 #define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
-NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+NCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
 #endif
 
 #if NCCL_VERSION_CODE >= 2304
 #define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion);
-NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+NCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
 #endif
 
 #if NCCL_VERSION_CODE >= 2703
 #define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
   __macro(ncclSend);                               \
   __macro(ncclRecv);
-NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+NCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
 #endif
 
 #if NCCL_VERSION_CODE >= 21100
 #define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \
   __macro(ncclRedOpCreatePreMulSum);                \
   __macro(ncclRedOpDestroy);
-NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+NCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
 #endif
 
 }  // namespace dynload
diff --git a/paddle/fluid/platform/dynload/nvjpeg.cc b/paddle/fluid/platform/dynload/nvjpeg.cc
index eb0ad78b9b73cd38e2d6dd1f58433da41094dd3f..006efd29121c4eb12b924a350333d2b1695e17a6 100644
--- a/paddle/fluid/platform/dynload/nvjpeg.cc
+++ b/paddle/fluid/platform/dynload/nvjpeg.cc
@@ -15,9 +15,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag nvjpeg_dso_flag;
-void *nvjpeg_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h
index ae457b2958f5deff9d879b012a0e06108d86c830..0e137173e4a6c2cb0fc1073a0e85994ce0902713 100644
--- a/paddle/fluid/platform/dynload/nvjpeg.h
+++ b/paddle/fluid/platform/dynload/nvjpeg.h
@@ -14,27 +14,14 @@ limitations under the License. */
 #include <nvjpeg.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/nvjpeg.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag nvjpeg_dso_flag;
-extern void *nvjpeg_dso_handle;
-
-#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                 \
-    template <typename... Args>                                              \
-    nvjpegStatus_t operator()(Args... args) {                                \
-      using nvjpegFunc = decltype(&::__name);                                \
-      std::call_once(nvjpeg_dso_flag, []() {                                 \
-        nvjpeg_dso_handle = paddle::platform::dynload::GetNvjpegDsoHandle(); \
-      });                                                                    \
-      static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name);           \
-      return reinterpret_cast<nvjpegFunc>(p_##__name)(args...);              \
-    }                                                                        \
-  };                                                                         \
+
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name)     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define NVJPEG_RAND_ROUTINE_EACH(__macro) \
@@ -44,7 +31,7 @@ extern void *nvjpeg_dso_handle;
   __macro(nvjpegJpegStateDestroy);        \
   __macro(nvjpegDecode);
 
-NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
+NVJPEG_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/nvrtc.cc b/paddle/fluid/platform/dynload/nvrtc.cc
index 74dfa5b3c22f8e846ff46b8baa2a66b6e4b8df8a..a07613b9bf4bf95ce03dd51b96c464c4315fb745 100644
--- a/paddle/fluid/platform/dynload/nvrtc.cc
+++ b/paddle/fluid/platform/dynload/nvrtc.cc
@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/nvrtc.h"
+#include "paddle/pten/backends/dynload/nvrtc.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag nvrtc_dso_flag;
-void* nvrtc_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 NVRTC_ROUTINE_EACH(DEFINE_WRAP);
 
-bool HasNVRTC() {
-  std::call_once(nvrtc_dso_flag,
-                 []() { nvrtc_dso_handle = GetNVRTCDsoHandle(); });
-  return nvrtc_dso_handle != nullptr;
-}
+bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h
index 720450d28b16f204cfc54b4e617adbe66997f539..b71d0b0231cf79477b321f71cf9ff7ed897fe9cc 100644
--- a/paddle/fluid/platform/dynload/nvrtc.h
+++ b/paddle/fluid/platform/dynload/nvrtc.h
@@ -17,30 +17,17 @@ limitations under the License. */
 #include <nvrtc.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/nvrtc.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag nvrtc_dso_flag;
-extern void* nvrtc_dso_handle;
 extern bool HasNVRTC();
 
-#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name)                            \
-  struct DynLoad__##__name {                                               \
-    template <typename... Args>                                            \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
-      using nvrtc_func = decltype(&::__name);                              \
-      std::call_once(nvrtc_dso_flag, []() {                                \
-        nvrtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
-      });                                                                  \
-      static void* p_##__name = dlsym(nvrtc_dso_handle, #__name);          \
-      return reinterpret_cast<nvrtc_func>(p_##__name)(args...);            \
-    }                                                                      \
-  };                                                                       \
-  extern struct DynLoad__##__name __name
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name)      \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
+  extern DynLoad__##__name __name
 
 /**
  * include all needed nvrtc functions
@@ -56,9 +43,9 @@ extern bool HasNVRTC();
   __macro(nvrtcGetProgramLog);      \
   __macro(nvrtcGetProgramLogSize)
 
-NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
+NVRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
 
-#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/nvtx.cc b/paddle/fluid/platform/dynload/nvtx.cc
index 372f8500e54dda0ee43c4129f9697c7e48d529a0..29683b2f2d4cf25c6dd74a137219672c28ae6316 100644
--- a/paddle/fluid/platform/dynload/nvtx.cc
+++ b/paddle/fluid/platform/dynload/nvtx.cc
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag nvtx_dso_flag;
-void *nvtx_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 NVTX_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h
index b696bbf91816aa286f113cae70afe1f7683d24db..64782612379b8d56a09f07d9eea84a0d2441ee4c 100644
--- a/paddle/fluid/platform/dynload/nvtx.h
+++ b/paddle/fluid/platform/dynload/nvtx.h
@@ -17,36 +17,23 @@ limitations under the License. */
 #include <nvToolsExt.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/nvtx.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
-extern std::once_flag nvtx_dso_flag;
-extern void *nvtx_dso_handle;
-
-#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name)                           \
-  struct DynLoad__##__name {                                             \
-    template <typename... Args>                                          \
-    int operator()(Args... args) {                                       \
-      using nvtxFunc = decltype(&::__name);                              \
-      std::call_once(nvtx_dso_flag, []() {                               \
-        nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle(); \
-      });                                                                \
-      static void *p_##__name = dlsym(nvtx_dso_handle, #__name);         \
-      return reinterpret_cast<nvtxFunc>(p_##__name)(args...);            \
-    }                                                                    \
-  };                                                                     \
+
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name)       \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define NVTX_ROUTINE_EACH(__macro) \
   __macro(nvtxRangePushA);         \
   __macro(nvtxRangePop);
 
-NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
+NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
 
-#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/rccl.cc b/paddle/fluid/platform/dynload/rccl.cc
index e19c22ba6d949f93c144833c042654f2a0bcbd8d..82838da685bf2d44bb60c1a082e025f4f9a30edb 100644
--- a/paddle/fluid/platform/dynload/rccl.cc
+++ b/paddle/fluid/platform/dynload/rccl.cc
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag rccl_dso_flag;
-void *rccl_dso_handle;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h
index ac9ab657d5ee318786539bd3747bf71ebe39e10f..5512756028ef59941dcfa2e1a6dbf5505e65077e 100644
--- a/paddle/fluid/platform/dynload/rccl.h
+++ b/paddle/fluid/platform/dynload/rccl.h
@@ -16,28 +16,14 @@ limitations under the License. */
 #include <rccl.h>
 
 #include <mutex>  // NOLINT
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/rccl.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag rccl_dso_flag;
-extern void* rccl_dso_handle;
-
-#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name)                           \
-  struct DynLoad__##__name {                                             \
-    template <typename... Args>                                          \
-    auto operator()(Args... args) -> decltype(__name(args...)) {         \
-      using nccl_func = decltype(&::__name);                             \
-      std::call_once(rccl_dso_flag, []() {                               \
-        rccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
-      });                                                                \
-      static void* p_##__name = dlsym(rccl_dso_handle, #__name);         \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);           \
-    }                                                                    \
-  };                                                                     \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name)       \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define RCCL_RAND_ROUTINE_EACH(__macro) \
@@ -57,18 +43,18 @@ extern void* rccl_dso_handle;
   __macro(ncclReduceScatter);           \
   __macro(ncclGetErrorString);
 
-RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+RCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
 
 #if NCCL_VERSION_CODE >= 2212
 #define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
-RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+RCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
 #endif
 
 #if NCCL_VERSION_CODE >= 2703
 #define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
   __macro(ncclSend);                               \
   __macro(ncclRecv);
-RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+RCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
 #endif
 
 }  // namespace dynload
diff --git a/paddle/fluid/platform/dynload/rocblas.cc b/paddle/fluid/platform/dynload/rocblas.cc
index ee774195363216255adbfb9715b9677c119b9c63..771989123c720a3430ceafdc23d78b1d5598ed93 100644
--- a/paddle/fluid/platform/dynload/rocblas.cc
+++ b/paddle/fluid/platform/dynload/rocblas.cc
@@ -17,8 +17,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag rocblas_dso_flag;
-void *rocblas_dso_handle = nullptr;
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
diff --git a/paddle/fluid/platform/dynload/rocblas.h b/paddle/fluid/platform/dynload/rocblas.h
index 45614f2209f880be3db3b034dbb4b9d7a4fe4310..a73bd61bda7aac538dfd4089eea48e6007ab59f6 100644
--- a/paddle/fluid/platform/dynload/rocblas.h
+++ b/paddle/fluid/platform/dynload/rocblas.h
@@ -19,16 +19,12 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 #include <type_traits>
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/rocblas.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag rocblas_dso_flag;
-extern void *rocblas_dso_handle;
-
 /**
  * The following macro definition can generate structs
  * (for each function) to dynamic load cublas routine
@@ -36,18 +32,8 @@ extern void *rocblas_dso_handle;
  *
  * note: default dynamic linked libs
  */
-#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                  \
-    template <typename... Args>                                               \
-    rocblas_status operator()(Args... args) {                                 \
-      using rocblas_func = decltype(&::__name);                               \
-      std::call_once(rocblas_dso_flag, []() {                                 \
-        rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
-      });                                                                     \
-      static void *p_##__name = dlsym(rocblas_dso_handle, #__name);           \
-      return reinterpret_cast<rocblas_func>(p_##__name)(args...);             \
-    }                                                                         \
-  };                                                                          \
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name)    \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
@@ -83,7 +69,7 @@ extern void *rocblas_dso_handle;
   __macro(rocblas_set_pointer_mode);       \
   __macro(rocblas_get_pointer_mode);
 
-ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+ROCBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
 
 // APIs available after CUDA 8.0
 #define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
@@ -94,21 +80,21 @@ ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
   __macro(rocblas_zgemm_strided_batched);     \
   __macro(rocblas_hgemm_strided_batched);
 
-ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+ROCBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
 
 // HIP not supported in ROCM3.5
 // #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
 //   __macro(cublasSetMathMode);
 //   __macro(cublasGetMathMode);
-// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+// ROCBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
 
 #define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
   __macro(rocblas_gemm_batched_ex);           \
   __macro(rocblas_gemm_strided_batched_ex);
 
-ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+ROCBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
 
-#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/rocm_driver.cc b/paddle/fluid/platform/dynload/rocm_driver.cc
index 9ec123b632ffa477e7671cd87667a50af594a968..465420665922dae197c69f86e8b445320576805a 100644
--- a/paddle/fluid/platform/dynload/rocm_driver.cc
+++ b/paddle/fluid/platform/dynload/rocm_driver.cc
@@ -13,22 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/rocm_driver.h"
+#include "paddle/pten/backends/dynload/rocm_driver.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag rocm_dso_flag;
-void* rocm_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 ROCM_ROUTINE_EACH(DEFINE_WRAP);
 
-bool HasCUDADriver() {
-  std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); });
-  return rocm_dso_handle != nullptr;
-}
+bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h
index 4527b6d6e4435e29560dd82339391e8fa637fbf3..2556ca3b337a6e0a224614c453c6e86fad0a8977 100644
--- a/paddle/fluid/platform/dynload/rocm_driver.h
+++ b/paddle/fluid/platform/dynload/rocm_driver.h
@@ -17,30 +17,17 @@ limitations under the License. */
 #include <hip/hip_runtime.h>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/rocm_driver.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag rocm_dso_flag;
-extern void* rocm_dso_handle;
 extern bool HasCUDADriver();
 
-#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name)                           \
-  struct DynLoad__##__name {                                             \
-    template <typename... Args>                                          \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {     \
-      using rocm_func = decltype(&::__name);                             \
-      std::call_once(rocm_dso_flag, []() {                               \
-        rocm_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
-      });                                                                \
-      static void* p_##__name = dlsym(rocm_dso_handle, #__name);         \
-      return reinterpret_cast<rocm_func>(p_##__name)(args...);           \
-    }                                                                    \
-  };                                                                     \
-  extern struct DynLoad__##__name __name
+#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name)       \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
+  extern DynLoad__##__name __name
 
 /**
  * include all needed cuda driver functions
@@ -59,9 +46,9 @@ extern bool HasCUDADriver();
   __macro(hipGetDeviceCount);                                 \
   __macro(hipDevicePrimaryCtxGetState)
 
-ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
+ROCM_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
 
-#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP
+#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/warpctc.cc b/paddle/fluid/platform/dynload/warpctc.cc
index 4a150048959c52e88515e196390aae57a4e9c12e..48c78a130732eb3055f5c063a184a255ea4adc13 100644
--- a/paddle/fluid/platform/dynload/warpctc.cc
+++ b/paddle/fluid/platform/dynload/warpctc.cc
@@ -18,9 +18,6 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag warpctc_dso_flag;
-void* warpctc_dso_handle = nullptr;
-
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 WARPCTC_ROUTINE_EACH(DEFINE_WRAP);
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index 5f1b7612117ffe9728caf75fd1db3bb8ca1e09f3..ea5adefa1114a01f54056069236fd46e5c40bf80 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -16,34 +16,19 @@ limitations under the License. */
 
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-#include "warpctc/include/ctc.h"
+#include "paddle/pten/backends/dynload/warpctc.h"
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
-extern std::once_flag warpctc_dso_flag;
-extern void* warpctc_dso_handle;
-
 /**
  * The following macro definition can generate structs
  * (for each function) to dynamic load warpctc routine
  * via operator overloading.
  */
-#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                                      \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {           \
-      using warpctcFunc = decltype(&::__name);                                 \
-      std::call_once(warpctc_dso_flag, []() {                                  \
-        warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
-      });                                                                      \
-      static void* p_##_name = dlsym(warpctc_dso_handle, #__name);             \
-      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);                \
-    }                                                                          \
-  };                                                                           \
+#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                     \
+  using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
 #define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 30930897ea8ca18e8477d88ec12010886b9103f4..32f233e44e952f6c78b7bfbfd3b0c600ac50d5e4 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -65,30 +65,30 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/macros.h"
-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/variant.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/to_string.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 #ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/dynload/cublas.h"
-#include "paddle/fluid/platform/dynload/cudnn.h"
-#include "paddle/fluid/platform/dynload/curand.h"
-#include "paddle/fluid/platform/dynload/cusolver.h"
+#include "paddle/pten/backends/dynload/cublas.h"
+#include "paddle/pten/backends/dynload/cudnn.h"
+#include "paddle/pten/backends/dynload/curand.h"
+#include "paddle/pten/backends/dynload/cusolver.h"
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 #include <error.h>
-#include "paddle/fluid/platform/dynload/nccl.h"
+#include "paddle/pten/backends/dynload/nccl.h"
 #endif  // __APPLE__
 #endif  // PADDLE_WITH_CUDA
 
 #ifdef PADDLE_WITH_HIP
-#include "paddle/fluid/platform/dynload/hipfft.h"
-#include "paddle/fluid/platform/dynload/hiprand.h"
-#include "paddle/fluid/platform/dynload/miopen.h"
-#include "paddle/fluid/platform/dynload/rocblas.h"
+#include "paddle/pten/backends/dynload/hipfft.h"
+#include "paddle/pten/backends/dynload/hiprand.h"
+#include "paddle/pten/backends/dynload/miopen.h"
+#include "paddle/pten/backends/dynload/rocblas.h"
 #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
 #include <error.h>  // NOLINT
-#include "paddle/fluid/platform/dynload/rccl.h"
+#include "paddle/pten/backends/dynload/rccl.h"
 #endif  // __APPLE__
 #endif  // PADDLE_WITH_HIP
 
@@ -880,7 +880,7 @@ inline bool is_error(cudnnStatus_t stat) {
 inline std::string build_nvidia_error_msg(cudnnStatus_t stat) {
   std::ostringstream sout;
   sout << "CUDNN error(" << stat << "), "
-       << platform::dynload::cudnnGetErrorString(stat) << ". "
+       << pten::dynload::cudnnGetErrorString(stat) << ". "
        << GetExternalErrorMsg(stat);
   return sout.str();
 }
@@ -945,7 +945,7 @@ inline bool is_error(ncclResult_t nccl_result) {
 inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) {
   std::ostringstream sout;
   sout << "NCCL error(" << nccl_result << "), "
-       << platform::dynload::ncclGetErrorString(nccl_result) << ". ";
+       << pten::dynload::ncclGetErrorString(nccl_result) << ". ";
   if (errno == ENOSPC || errno == EAGAIN) {
     std::string detail(strerror(errno));
     detail += "\nPlease try one of the following solutions:";
@@ -1090,7 +1090,7 @@ inline bool is_error(miopenStatus_t stat) {
 
 inline std::string build_rocm_error_msg(miopenStatus_t stat) {
   std::string msg(" Miopen error, ");
-  return msg + platform::dynload::miopenGetErrorString(stat) + " ";
+  return msg + pten::dynload::miopenGetErrorString(stat) + " ";
 }
 
 /***** ROCBLAS ERROR *****/
@@ -1132,7 +1132,7 @@ inline bool is_error(ncclResult_t nccl_result) {
 
 inline std::string build_rocm_error_msg(ncclResult_t nccl_result) {
   std::string msg(" Rccl error, ");
-  return msg + platform::dynload::ncclGetErrorString(nccl_result) + " ";
+  return msg + pten::dynload::ncclGetErrorString(nccl_result) + " ";
 }
 #endif  // not(__APPLE__) and PADDLE_WITH_NCCL
 
@@ -1141,7 +1141,7 @@ inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; }
 
 inline std::string build_rocm_error_msg(hipfftResult_t stat) {
   std::string msg(" HIPFFT error, ");
-  return msg + platform::dynload::hipfftGetErrorString(stat) + " ";
+  return msg + pten::dynload::hipfftGetErrorString(stat) + " ";
 }
 
 namespace details {
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index 56633a35116719b617ac7b2dbc6aeac381a251d6..b1132952e72824877b4f83b348be1bca71fe90d9 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/eigen_ext.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 8be774441fe7c00a7d01c198df409ab269ff850a..7a047d790abdc06710659e4388323fd0bf87dba5 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -19,8 +19,8 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/eigen_ext.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
 
 #define ARITHMETIC_KERNEL(op_type, sign)                                 \
   __global__ void op_type(const half *in1, const half *in2, half *out) { \
diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc
index 07263153164e24c0ffc7acd3d975389ee8bbd7b0..92d218504eaaaa4a8ab9c7caaf56ca7580464cad 100644
--- a/paddle/fluid/platform/os_info.cc
+++ b/paddle/fluid/platform/os_info.cc
@@ -24,6 +24,8 @@ limitations under the License. */
 #include <unistd.h>
 #elif defined(_MSC_VER)
 #include <processthreadsapi.h>
+#else
+#include <unistd.h>
 #endif
 #include "paddle/fluid/platform/macros.h"  // import DISABLE_COPY_AND_ASSIGN
 
diff --git a/paddle/fluid/platform/os_info.h b/paddle/fluid/platform/os_info.h
index c84738247a46f33c697ba1d66c7c56177e60bb91..d8e3b0524f4d0ac76c0af634800d8399aa81bca0 100644
--- a/paddle/fluid/platform/os_info.h
+++ b/paddle/fluid/platform/os_info.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #ifdef _POSIX_C_SOURCE
 #include <time.h>
 #endif
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h
index 09dcc4369beb06f11a429e4b9ffbae454a34f441..0054968e525eed789544ce8cce4855b6cec1f3dd 100644
--- a/paddle/fluid/platform/timer.h
+++ b/paddle/fluid/platform/timer.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <stdlib.h>
 
-#include "paddle/fluid/platform/port.h"
+#include "paddle/pten/backends/dynload/port.h"
 
 #ifdef _WIN32
 static unsigned sleep(unsigned seconds) {
diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc
index 3a7043809d96483c3ff19a1442be7d0b28d9cff3..607423d64f53478ac4428393befce815a3b9d540 100644
--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -80,6 +80,7 @@ void EmptyEagerTensorInitializer(
         std::make_shared<pten::DenseTensor>(
             pten::make_intrusive<paddle::experimental::SharedStorage>(place),
             pten::DenseTensorMeta(pten::TransToPtenDataType(dtype), ddims));
+    dense_tensor->mutable_data(place);
     self->eager_tensor.set_impl(dense_tensor);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/pten/api/CMakeLists.txt
index a454ae807bcaaebc90494db804e17d5791bfcc91..0491363eda78e3cd4c7001981db6e09828f2a34a 100644
--- a/paddle/pten/api/CMakeLists.txt
+++ b/paddle/pten/api/CMakeLists.txt
@@ -1,3 +1,3 @@
 add_subdirectory(lib)
 
-cc_library(pten_api SRCS all.cc DEPS pten_function_api utils_api)
+cc_library(pten_api SRCS all.cc DEPS pten_function_api pten_bw_function_api utils_api)
diff --git a/paddle/pten/api/backward/README.md b/paddle/pten/api/backward/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc934a975f53a01159e46a115363ea58086f358a
--- /dev/null
+++ b/paddle/pten/api/backward/README.md
@@ -0,0 +1 @@
+The code files in this directory (paddle/pten/api/backward) are auto-generated when building PaddlePaddle.
diff --git a/paddle/pten/api/ext/dispatch.h b/paddle/pten/api/ext/dispatch.h
index 07d29ef3e140befe5f28fac92694457921b398a5..945a9557c40e05c93791ce40072ed3875f27d8ea 100644
--- a/paddle/pten/api/ext/dispatch.h
+++ b/paddle/pten/api/ext/dispatch.h
@@ -272,16 +272,10 @@ namespace paddle {
           NAME, ::pten::DataType::UINT8, uint8_t, __VA_ARGS__)                \
       PD_PRIVATE_CASE_TYPE(                                                   \
           NAME, ::pten::DataType::INT16, int16_t, __VA_ARGS__)                \
-      PD_PRIVATE_CASE_TYPE(                                                   \
-          NAME, ::pten::DataType::UINT16, uint16_t, __VA_ARGS__)              \
       PD_PRIVATE_CASE_TYPE(                                                   \
           NAME, ::pten::DataType::INT32, int32_t, __VA_ARGS__)                \
-      PD_PRIVATE_CASE_TYPE(                                                   \
-          NAME, ::pten::DataType::UINT32, uint32_t, __VA_ARGS__)              \
       PD_PRIVATE_CASE_TYPE(                                                   \
           NAME, ::pten::DataType::INT64, int64_t, __VA_ARGS__)                \
-      PD_PRIVATE_CASE_TYPE(                                                   \
-          NAME, ::pten::DataType::UINT64, uint64_t, __VA_ARGS__)              \
       PD_PRIVATE_CASE_TYPE(NAME,                                              \
                            ::pten::DataType::BFLOAT16,                        \
                            paddle::experimental::bfloat16,                    \
diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h
index d750b47ef864b404b47551a8501acdaee833bde7..b8e7b0d75bc6cb5d8458c4e0663bc4ff1cd1a732 100644
--- a/paddle/pten/api/include/kernel_signature.h
+++ b/paddle/pten/api/include/kernel_signature.h
@@ -115,4 +115,14 @@ using conj_kernel = void (*)(const DeviceContext&,
                              const DenseTensor&,
                              DenseTensor*);
 
+/* -------------- Grad Kernel ----------------- */
+using matmul_grad_kernel = void (*)(const DeviceContext&,
+                                    const DenseTensor&,
+                                    const DenseTensor&,
+                                    const DenseTensor&,
+                                    bool,
+                                    bool,
+                                    DenseTensor*,
+                                    DenseTensor*);
+
 }  // namespace pten
diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt
index 1c2b3823920d6e7877842ab48d66f48f4a9af076..1e645a68edfdfa8b09216860cb905a171a0258aa 100644
--- a/paddle/pten/api/lib/CMakeLists.txt
+++ b/paddle/pten/api/lib/CMakeLists.txt
@@ -14,18 +14,27 @@ cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor pten_context
 
 cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor)
 
+# forward api file
 set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py)
 set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml)
-
 set(api_header_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/include/api.h)
 set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc)
 set(api_header_file_tmp ${api_header_file}.tmp)
 set(api_source_file_tmp ${api_source_file}.tmp)
 
+# backward api file
+set(bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward_api_gen.py)
+set(bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml)
+set(bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/backward/backward_api.h)
+set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/backward_api.cc)
+set(bw_api_header_file_tmp ${bw_api_header_file}.tmp)
+set(bw_api_source_file_tmp ${bw_api_source_file}.tmp)
+
 if (NOT PYTHON_EXECUTABLE)
   find_package(PythonInterp REQUIRED)
 endif()
 
+# generate forward api
 add_custom_command(
   OUTPUT ${api_header_file} ${api_source_file}
   COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
@@ -39,5 +48,19 @@ add_custom_command(
   DEPENDS ${api_yaml_file} ${api_gen_file}
   VERBATIM)
 
+# generate backward api: the generator writes the *.tmp files, which are then copied over the real header/source only when their content differs
+add_custom_command(
+  OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp} ${bw_api_source_file_tmp}
+  COMMAND ${PYTHON_EXECUTABLE} ${bw_api_gen_file}
+                 --backward_yaml_path ${bw_api_yaml_file}
+                 --backward_header_path ${bw_api_header_file_tmp}
+                 --backward_source_path ${bw_api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_header_file_tmp} ${bw_api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp} ${bw_api_source_file}
+  COMMENT "copy_if_different ${bw_api_header_file} ${bw_api_source_file}"
+  DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file}
+  VERBATIM)
+
 cc_library(utils_api SRCS utils.cc DEPS pten_tensor pten kernel_dispatch)
 cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch)
+cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta)
diff --git a/paddle/pten/api/lib/api_declare.h b/paddle/pten/api/lib/api_declare.h
index d29050c8ba4a825e161c92f9aa6dac7a86988154..0023170714fa6bfeed4793313833278dc2bbc373 100644
--- a/paddle/pten/api/lib/api_declare.h
+++ b/paddle/pten/api/lib/api_declare.h
@@ -17,8 +17,5 @@ limitations under the License. */
 // api symbols declare, remove in the future
 #include "paddle/pten/api/lib/api_registry.h"
 
-PT_DECLARE_API(Creation);
-PT_DECLARE_API(Linalg);
-PT_DECLARE_API(Manipulation);
 PT_DECLARE_API(Math);
 PT_DECLARE_API(Utils);
diff --git a/paddle/pten/api/lib/api_utils.h b/paddle/pten/api/lib/api_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3e7d74db1e89fddc27af6272c47a8e9e05af8bb
--- /dev/null
+++ b/paddle/pten/api/lib/api_utils.h
@@ -0,0 +1,111 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "paddle/pten/api/include/tensor.h"
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace paddle {
+namespace experimental {
+
+/* ------------------ for input ----------------------- */
+
+// Returns the tensor's impl viewed as a pten::DenseTensor. The result is
+// null when impl() is not a DenseTensor (dynamic_pointer_cast failure).
+inline std::shared_ptr<pten::DenseTensor> TensorToDenseTensor(
+    const Tensor& tensor) {
+  return std::dynamic_pointer_cast<pten::DenseTensor>(tensor.impl());
+}
+
+// Builds a vector holding a copy of the DenseTensor behind each input tensor.
+// NOTE(review): the cast result is dereferenced without a null check, so
+// every element is expected to hold a DenseTensor impl - confirm at call
+// sites.
+inline std::unique_ptr<std::vector<pten::DenseTensor>> TensorToDenseTensor(
+    const std::vector<Tensor>& tensors) {
+  auto pt_tensors = std::make_unique<std::vector<pten::DenseTensor>>();
+  pt_tensors->reserve(tensors.size());
+
+  for (const auto& t : tensors) {
+    pt_tensors->push_back(
+        *std::dynamic_pointer_cast<pten::DenseTensor>(t.impl()));
+  }
+
+  // Returning the local by name lets it be moved/elided implicitly; an
+  // explicit std::move here would only inhibit copy elision.
+  return pt_tensors;
+}
+
+/* ----------------- for infer_meta --------------------- */
+
+// Returns a reference to the meta stored in `tensor`.
+inline const pten::DenseTensorMeta& GetDenseTensorMeta(
+    const pten::DenseTensor& tensor) {
+  return tensor.meta();
+}
+
+// Collects a copy of the meta of every tensor, preserving input order.
+inline std::vector<pten::DenseTensorMeta> GetDenseTensorMeta(
+    const std::vector<pten::DenseTensor>& tensors) {
+  std::vector<pten::DenseTensorMeta> metas;
+  metas.reserve(tensors.size());
+  for (const auto& t : tensors) {
+    metas.push_back(t.meta());
+  }
+  return metas;
+}
+
+/* ------------------ for output ----------------------- */
+
+// Creates a DenseTensor with `meta`, backed by a SharedStorage on the place
+// translated from `backend`, passes it to out->set_impl, and returns a
+// non-owning raw pointer to it.
+inline pten::DenseTensor* SetKernelOutput(const pten::DenseTensorMeta& meta,
+                                          Backend backend,
+                                          Tensor* out) {
+  auto dense_tensor = std::make_shared<pten::DenseTensor>(
+      pten::make_intrusive<SharedStorage>(pten::TransToFluidPlace(backend)),
+      meta);
+  out->set_impl(dense_tensor);
+  return dense_tensor.get();
+}
+
+// Vector overload: appends one new Tensor to `out` per entry of `metas` and
+// returns the raw DenseTensor pointers in the same order.
+inline std::vector<pten::DenseTensor*> SetKernelOutput(
+    const std::vector<pten::DenseTensorMeta>& metas,
+    Backend backend,
+    std::vector<Tensor>* out) {
+  size_t n = metas.size();
+  out->reserve(n);
+  std::vector<pten::DenseTensor*> results(n);
+  for (size_t i = 0; i < n; ++i) {
+    auto tensor_ptr = std::make_shared<pten::DenseTensor>(
+        pten::make_intrusive<SharedStorage>(pten::TransToFluidPlace(backend)),
+        metas[i]);
+    results[i] = tensor_ptr.get();
+    out->emplace_back();
+    out->back().set_impl(tensor_ptr);
+  }
+  return results;
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc
index 0ccc9c56dbff729061ec759d8e9a627b75a853ef..3389dacec36a5c5515fd95c66f7a39ea27d5fc40 100644
--- a/paddle/pten/api/lib/tensor.cc
+++ b/paddle/pten/api/lib/tensor.cc
@@ -149,8 +149,8 @@ bool Tensor::is_cuda() const {
 template <typename T>
 T *Tensor::mutable_data() {
   if (is_dense_tensor()) {
-    return std::dynamic_pointer_cast<pten::DenseTensor>(impl_)
-        ->mutable_data<T>();
+    return std::dynamic_pointer_cast<pten::DenseTensor>(impl_)->mutable_data<T>(
+        ConvertExtPlaceToInnerPlace(place()));
   }
   return nullptr;
 }
@@ -173,12 +173,18 @@ Tensor::mutable_data<paddle::platform::float16>();
 template <typename T>
 T *Tensor::mutable_data(const PlaceType &place) {
   auto inner_place = ConvertExtPlaceToInnerPlace(place);
-  PADDLE_ENFORCE_EQ(
-      platform::is_same_place(inner_place, impl_->place()),
-      true,
-      platform::errors::Unimplemented("Modification of tensor place through "
-                                      "mutable_data is not supported now"));
-  return mutable_data<T>();
+  if (impl_->initialized()) {
+    PADDLE_ENFORCE_EQ(
+        platform::is_same_place(inner_place, impl_->place()),
+        true,
+        platform::errors::Unimplemented("Modification of tensor place through "
+                                        "mutable_data is not supported now"));
+  }
+  if (is_dense_tensor()) {
+    return std::dynamic_pointer_cast<pten::DenseTensor>(impl_)->mutable_data<T>(
+        inner_place);
+  }
+  return nullptr;
 }
 
 template PADDLE_API float *Tensor::mutable_data<float>(const PlaceType &place);
@@ -205,7 +211,8 @@ Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
 template <typename T>
 const T *Tensor::data() const {
   if (is_dense_tensor()) {
-    return std::dynamic_pointer_cast<pten::DenseTensor>(impl_)->data<T>();
+    return std::dynamic_pointer_cast<pten::DenseTensor>(impl_)->mutable_data<T>(
+        ConvertExtPlaceToInnerPlace(place()));
   }
   return nullptr;
 }
@@ -217,7 +224,6 @@ template PADDLE_API const int32_t *Tensor::data<int32_t>() const;
 template PADDLE_API const uint8_t *Tensor::data<uint8_t>() const;
 template PADDLE_API const int8_t *Tensor::data<int8_t>() const;
 template PADDLE_API const int16_t *Tensor::data<int16_t>() const;
-template PADDLE_API const uint16_t *Tensor::data<uint16_t>() const;
 template PADDLE_API const bool *Tensor::data<bool>() const;
 template PADDLE_API const paddle::platform::complex<float>
     *Tensor::data<paddle::platform::complex<float>>() const;
diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc
index f42f3b37f0a1ce47a2677dac413c832675b7efc3..aacbfb15ed738e222f7d99646520412891b4fc01 100644
--- a/paddle/pten/api/lib/utils.cc
+++ b/paddle/pten/api/lib/utils.cc
@@ -65,6 +65,7 @@ PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) {
       pten::make_intrusive<paddle::experimental::SharedStorage>(
           pten::TransToFluidPlace(backend)),
       std::move(out_meta));
+  dense_out->mutable_data(pten::TransToFluidPlace(backend));
   kernel_context.EmplaceBackOutput(dense_out.get());
   Tensor out;
   out.set_impl(dense_out);
diff --git a/paddle/pten/api/lib/utils/storage.h b/paddle/pten/api/lib/utils/storage.h
index e102ecbc5de7df86db9031b07b07d2e008d11022..a02e5d46a65c5b97550a67aa743577164a2ec231 100644
--- a/paddle/pten/api/lib/utils/storage.h
+++ b/paddle/pten/api/lib/utils/storage.h
@@ -39,6 +39,18 @@ class ExternalStorage : public pten::Storage {
     size_ = 0;
   }
 
+  void set_data_shared(
+      const std::shared_ptr<paddle::memory::Allocation>& holder) override {
+    CHECK(holder);  // null holders are rejected before any member is updated
+    data_ = holder;
+    size_ = holder->size();  // size_ mirrors the size reported by the holder
+  }
+
+  std::shared_ptr<paddle::memory::Allocation>&& move_data_shared() override {
+    size_ = 0;                // size() reports zero from here on
+    return std::move(data_);  // yields data_ as an rvalue reference
+  }
+
   size_t size() const noexcept override { return size_; }
   const paddle::platform::Place& place() const override {
     PADDLE_ENFORCE_NOT_NULL(
@@ -92,6 +104,12 @@ class SharedStorage : public pten::Storage {
     }
   }
 
+  std::shared_ptr<paddle::memory::Allocation>&& move_data_shared() override {
+    size_ = 0;
+    place_ = Place();         // reset to a default-constructed Place
+    return std::move(data_);  // yields data_ as an rvalue reference
+  }
+
   size_t size() const noexcept override {
     return data_ ? data_->size() : size_;
   }
diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc
index 2e94d508aec7df6ea85973118a272130a23db4d6..e9f5ec2d05727adde9cee1c7ad32595f914bbdde 100644
--- a/paddle/pten/api/lib/utils/tensor_utils.cc
+++ b/paddle/pten/api/lib/utils/tensor_utils.cc
@@ -311,7 +311,7 @@ void MovesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) {
       dst,
       platform::errors::InvalidArgument(
           "The destination Tensor is nullptr when move storage."));
-  dst->ResizeAndAllocate(src->dims());
+  dst->Resize(src->dims());
   dst->set_type(pten::TransToProtoVarType(src->dtype()));
   auto storage = src->MoveMemoryHolder();
   dst->ResetHolderWithType(storage, pten::TransToProtoVarType(src->dtype()));
@@ -332,7 +332,7 @@ void SharesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) {
       dst,
       platform::errors::InvalidArgument(
           "The destination Tensor is nullptr when move allocation."));
-  dst->ResizeAndAllocate(src->dims());
+  dst->Resize(src->dims());
   dst->ResetHolderWithType(src->Holder(),
                            pten::TransToProtoVarType(src->dtype()));
   dst->set_offset(src->meta().offset);
@@ -374,7 +374,7 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src,
     auto* tensor = variable->GetMutable<framework::LoDTensor>();
 
     auto dtype = pten::TransToProtoVarType(src->dtype());
-    tensor->ResizeAndAllocate(src->dims());
+    tensor->Resize(src->dims());
     SetLoD(tensor->mutable_lod(), src->lod());
 
     if (!tensor->IsInitialized() ||
diff --git a/paddle/pten/backends/CMakeLists.txt b/paddle/pten/backends/CMakeLists.txt
index e45adefe652e38e2620b8fb4de357e702ed82211..3587910ff506e572ebeead963015a8c9591388b7 100644
--- a/paddle/pten/backends/CMakeLists.txt
+++ b/paddle/pten/backends/CMakeLists.txt
@@ -1,2 +1,5 @@
+add_subdirectory(dynload)
+
 add_subdirectory(cpu)
+
 cc_library(pten_context SRCS all_context.cc DEPS device_context)
diff --git a/paddle/pten/backends/dynload/CMakeLists.txt b/paddle/pten/backends/dynload/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b7242fc76df7c5db69d58363de6f5427b397aaa6
--- /dev/null
+++ b/paddle/pten/backends/dynload/CMakeLists.txt
@@ -0,0 +1,57 @@
+cc_library(pten_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags)
+
+list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc  nvtx.cc cufft.cc)
+
+if (NOT WITH_NV_JETSON)  # nvjpeg.cc is left out of Jetson builds
+  list(APPEND CUDA_SRCS nvjpeg.cc)
+endif()
+
+if (WITH_ROCM)
+  list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc)
+endif()
+
+# There is no macOS version of NCCL.
+# Disable nvrtc and the cuda_driver API on macOS; only do an early test on Linux and Windows.
+if (NOT APPLE)
+  list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
+  if (WITH_NCCL)
+    list(APPEND CUDA_SRCS nccl.cc)
+  endif()
+  if (WITH_ROCM)
+    list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc)
+    if (WITH_RCCL)
+      list(APPEND HIP_SRCS rccl.cc)
+    endif()
+  endif()
+endif()
+
+if (TENSORRT_FOUND)
+  list(APPEND CUDA_SRCS tensorrt.cc)
+endif()
+
+configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
+if (CUPTI_FOUND)
+  list(APPEND CUDA_SRCS cupti.cc)
+endif(CUPTI_FOUND)
+if(WITH_ROCM)  # HIP sources are built under the pten_dynload_cuda target name
+  hip_library(pten_dynload_cuda SRCS ${HIP_SRCS} DEPS pten_dynamic_loader)
+  cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc)
+elseif (WITH_ASCEND_CL)  # only the warpctc wrapper is built, with npu_hccl added
+  cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc npu_hccl)
+else()
+  nv_library(pten_dynload_cuda SRCS ${CUDA_SRCS} DEPS pten_dynamic_loader)
+  cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc)
+endif()
+if (WITH_MKLML)
+  cc_library(pten_dynload_mklml SRCS mklml.cc DEPS pten_dynamic_loader mklml)
+endif()
+
+cc_library(pten_dynload_lapack SRCS lapack.cc DEPS pten_dynamic_loader)
+add_dependencies(pten_dynload_lapack extern_lapack)
+# TODO(TJ): add iomp, mkldnn?
+
+if (MKL_FOUND AND WITH_ONEMKL)
+  message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
+  cc_library(pten_dynload_mklrt SRCS mklrt.cc DEPS pten_dynamic_loader)
+  target_include_directories(pten_dynload_mklrt PRIVATE ${MKL_INCLUDE})
+endif()
diff --git a/paddle/pten/backends/dynload/cublas.cc b/paddle/pten/backends/dynload/cublas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1c819346ed24909ede8319200fd772b1e9c5da5
--- /dev/null
+++ b/paddle/pten/backends/dynload/cublas.cc
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/cublas.h"
+
+namespace pten {
+namespace dynload {
+std::once_flag cublas_dso_flag;  // passed to std::call_once by the wrappers
+void *cublas_dso_handle = nullptr;  // assigned inside that call_once
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);  // one wrapper object per routine
+
+#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2  // list is only defined for CUDA >= 8.0
+CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
+#endif
+
+#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3  // list is only defined for CUDA >= 9.0
+CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
+#endif
+
+#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4  // list is only defined for CUDA >= 9.1
+CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
+#endif
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cublas.h b/paddle/pten/backends/dynload/cublas.h
new file mode 100644
index 0000000000000000000000000000000000000000..4748b40a24782482bb7217fc02d0f71a192e4117
--- /dev/null
+++ b/paddle/pten/backends/dynload/cublas.h
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cublasXt.h>
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <mutex>  // NOLINT
+#include <type_traits>
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag cublas_dso_flag;
+extern void *cublas_dso_handle;
+
+/**
+ * For a cuBLAS routine `__name`, this macro defines struct DynLoad__<name>
+ * and declares an extern object `__name` of that type. Calling the object
+ * fetches the cuBLAS handle once (std::call_once on cublas_dso_flag), looks
+ * the symbol up with dlsym into a local static, and forwards the arguments.
+ * note: default dynamic linked libs
+ */
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                            \
+  struct DynLoad__##__name {                                                \
+    template <typename... Args>                                             \
+    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using cublas_func =                                                   \
+          decltype(::__name(std::declval<Args>()...)) (*)(Args...);         \
+      std::call_once(cublas_dso_flag, []() {                                \
+        cublas_dso_handle = pten::dynload::GetCublasDsoHandle();            \
+      });                                                                   \
+      static void *p_##__name = dlsym(cublas_dso_handle, #__name);          \
+      return reinterpret_cast<cublas_func>(p_##__name)(args...);            \
+    }                                                                       \
+  };                                                                        \
+  extern DynLoad__##__name __name
+
+#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
+  __macro(cublasSaxpy_v2);                \
+  __macro(cublasDaxpy_v2);                \
+  __macro(cublasCaxpy_v2);                \
+  __macro(cublasZaxpy_v2);                \
+  __macro(cublasSscal_v2);                \
+  __macro(cublasDscal_v2);                \
+  __macro(cublasScopy_v2);                \
+  __macro(cublasDcopy_v2);                \
+  __macro(cublasSgemv_v2);                \
+  __macro(cublasDgemv_v2);                \
+  __macro(cublasCgemv_v2);                \
+  __macro(cublasZgemv_v2);                \
+  __macro(cublasSgemm_v2);                \
+  __macro(cublasDgemm_v2);                \
+  __macro(cublasCgemm_v2);                \
+  __macro(cublasZgemm_v2);                \
+  __macro(cublasHgemm);                   \
+  __macro(cublasSgemmEx);                 \
+  __macro(cublasSgeam);                   \
+  __macro(cublasDgeam);                   \
+  __macro(cublasStrsm_v2);                \
+  __macro(cublasDtrsm_v2);                \
+  __macro(cublasCtrsm_v2);                \
+  __macro(cublasZtrsm_v2);                \
+  __macro(cublasCreate_v2);               \
+  __macro(cublasDestroy_v2);              \
+  __macro(cublasSetStream_v2);            \
+  __macro(cublasSetPointerMode_v2);       \
+  __macro(cublasGetPointerMode_v2);       \
+  __macro(cublasSgemmBatched);            \
+  __macro(cublasDgemmBatched);            \
+  __macro(cublasCgemmBatched);            \
+  __macro(cublasZgemmBatched);            \
+  __macro(cublasStrsmBatched);            \
+  __macro(cublasDtrsmBatched);            \
+  __macro(cublasCtrsmBatched);            \
+  __macro(cublasZtrsmBatched);            \
+  __macro(cublasSgetrfBatched);           \
+  __macro(cublasSgetriBatched);           \
+  __macro(cublasDgetrfBatched);           \
+  __macro(cublasDgetriBatched);           \
+  __macro(cublasSmatinvBatched);          \
+  __macro(cublasDmatinvBatched);          \
+  __macro(cublasSgetrsBatched);           \
+  __macro(cublasDgetrsBatched);
+
+CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+
+// APIs available starting with CUDA 8.0
+#if CUDA_VERSION >= 8000
+#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
+  __macro(cublasGemmEx);                     \
+  __macro(cublasSgemmStridedBatched);        \
+  __macro(cublasDgemmStridedBatched);        \
+  __macro(cublasCgemmStridedBatched);        \
+  __macro(cublasZgemmStridedBatched);        \
+  __macro(cublasHgemmStridedBatched);
+
+CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+#endif
+
+// APIs available starting with CUDA 9.0
+#if CUDA_VERSION >= 9000
+#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \
+  __macro(cublasSetMathMode);                \
+  __macro(cublasGetMathMode);
+
+CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+#endif
+
+// APIs available starting with CUDA 9.1
+#if CUDA_VERSION >= 9010
+#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
+  __macro(cublasGemmBatchedEx);              \
+  __macro(cublasGemmStridedBatchedEx);
+
+CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+#endif
+
+#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cublasLt.cc b/paddle/pten/backends/dynload/cublasLt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5d6ce6c7b958388922f4993e37e5562a15d39679
--- /dev/null
+++ b/paddle/pten/backends/dynload/cublasLt.cc
@@ -0,0 +1,27 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/cublasLt.h"
+
+namespace pten {
+namespace dynload {
+std::once_flag cublasLt_dso_flag;  // passed to std::call_once by the wrappers
+void *cublasLt_dso_handle = nullptr;  // assigned inside that call_once
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUBLASLT_BLAS_ROUTINE_EACH(DEFINE_WRAP);  // one wrapper object per routine
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cublasLt.h b/paddle/pten/backends/dynload/cublasLt.h
new file mode 100644
index 0000000000000000000000000000000000000000..68c23b30b667e4487dd476f3ad8d7cda0274e3de
--- /dev/null
+++ b/paddle/pten/backends/dynload/cublasLt.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cublasLt.h>
+#include <cuda.h>
+#include <mutex>  // NOLINT
+#include <type_traits>
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag cublasLt_dso_flag;
+extern void *cublasLt_dso_handle;
+
+/**
+ * For a cuBLASLt routine `__name`, this macro defines struct DynLoad__<name>
+ * and declares an extern object `__name` of that type. Calling the object
+ * fetches the library handle once (std::call_once on cublasLt_dso_flag),
+ * looks the symbol up with dlsym into a local static, and forwards the args.
+ * note: default dynamic linked libs
+ */
+#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name)                          \
+  struct DynLoad__##__name {                                                \
+    template <typename... Args>                                             \
+    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using cublasLt_func =                                                 \
+          decltype(::__name(std::declval<Args>()...)) (*)(Args...);         \
+      std::call_once(cublasLt_dso_flag, []() {                              \
+        cublasLt_dso_handle = pten::dynload::GetCublasLtDsoHandle();        \
+      });                                                                   \
+      static void *p_##__name = dlsym(cublasLt_dso_handle, #__name);        \
+      return reinterpret_cast<cublasLt_func>(p_##__name)(args...);          \
+    }                                                                       \
+  };                                                                        \
+  extern DynLoad__##__name __name
+
+// APIs available starting with CUDA 10.1
+// #if CUDA_VERSION >= 10100
+#define CUBLASLT_BLAS_ROUTINE_EACH(__macro)    \
+  __macro(cublasLtCreate);                     \
+  __macro(cublasLtDestroy);                    \
+  __macro(cublasLtMatmul);                     \
+  __macro(cublasLtMatmulDescCreate);           \
+  __macro(cublasLtMatmulDescDestroy);          \
+  __macro(cublasLtMatmulDescSetAttribute);     \
+  __macro(cublasLtMatrixLayoutCreate);         \
+  __macro(cublasLtMatrixLayoutDestroy);        \
+  __macro(cublasLtMatrixLayoutSetAttribute);   \
+  __macro(cublasLtMatrixTransform);            \
+  __macro(cublasLtMatrixTransformDescCreate);  \
+  __macro(cublasLtMatrixTransformDescDestroy); \
+  __macro(cublasLtMatrixTransformDescSetAttribute);
+
+CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
+// #endif
+
+#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cuda_driver.cc b/paddle/pten/backends/dynload/cuda_driver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ae72a6a5740214396cc3dd35c996d5a336942130
--- /dev/null
+++ b/paddle/pten/backends/dynload/cuda_driver.cc
@@ -0,0 +1,36 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/cuda_driver.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag cuda_dso_flag;
+void* cuda_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+#if CUDA_VERSION >= 10020
+CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP);
+#endif
+CUDA_ROUTINE_EACH(DEFINE_WRAP);
+
+bool HasCUDADriver() {
+  std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); });
+  return cuda_dso_handle != nullptr;  // reports whether the handle is non-null
+}
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cuda_driver.h b/paddle/pten/backends/dynload/cuda_driver.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b91d7049a910f353661bbd8e01653ef4fb95cbd
--- /dev/null
+++ b/paddle/pten/backends/dynload/cuda_driver.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cuda.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag cuda_dso_flag;
+extern void* cuda_dso_handle;
+extern bool HasCUDADriver();
+
+#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name)                       \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using cuda_func = decltype(&::__name);                         \
+      std::call_once(cuda_dso_flag, []() {                           \
+        cuda_dso_handle = pten::dynload::GetCUDADsoHandle();         \
+      });                                                            \
+      static void* p_##__name = dlsym(cuda_dso_handle, #__name);     \
+      return reinterpret_cast<cuda_func>(p_##__name)(args...);       \
+    }                                                                \
+  };                                                                 \
+  extern struct DynLoad__##__name __name
+
+/**
+ * List of the CUDA driver functions that get a dynamic-load wrapper.
+ **/
+#define CUDA_ROUTINE_EACH(__macro)                      \
+  __macro(cuInit);                                      \
+  __macro(cuDriverGetVersion);                          \
+  __macro(cuGetErrorString);                            \
+  __macro(cuModuleLoadData);                            \
+  __macro(cuModuleGetFunction);                         \
+  __macro(cuModuleUnload);                              \
+  __macro(cuOccupancyMaxActiveBlocksPerMultiprocessor); \
+  __macro(cuLaunchKernel);                              \
+  __macro(cuCtxCreate);                                 \
+  __macro(cuCtxGetCurrent);                             \
+  __macro(cuDeviceGetCount);                            \
+  __macro(cuDevicePrimaryCtxGetState);                  \
+  __macro(cuDeviceGetAttribute);                        \
+  __macro(cuDeviceGet)
+
+#if CUDA_VERSION >= 10020
+#define CUDA_ROUTINE_EACH_VVM(__macro)    \
+  __macro(cuMemGetAllocationGranularity); \
+  __macro(cuMemAddressReserve);           \
+  __macro(cuMemCreate);                   \
+  __macro(cuMemMap);                      \
+  __macro(cuMemSetAccess);                \
+  __macro(cuMemUnmap);                    \
+  __macro(cuMemRelease);                  \
+  __macro(cuMemAddressFree)
+
+CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
+#endif
+
+CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cudnn.cc b/paddle/pten/backends/dynload/cudnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..67447e7359ffce69b6145d565bcddff3b2c4e147
--- /dev/null
+++ b/paddle/pten/backends/dynload/cudnn.cc
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/cudnn.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag cudnn_dso_flag;
+void* cudnn_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP);
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8
+CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DEFINE_WRAP);
+#endif
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_R7
+CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
+#endif
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7
+CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
+#endif
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7
+CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
+#endif
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_R8
+CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
+#endif
+
+bool HasCUDNN() {
+  std::call_once(cudnn_dso_flag,  // load attempt is guarded by the once-flag
+                 []() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
+  return cudnn_dso_handle != nullptr;  // reports whether the handle is non-null
+}
+
+void EnforceCUDNNLoaded(const char* fn_name) {
+  PADDLE_ENFORCE_NOT_NULL(
+      cudnn_dso_handle,  // checked as-is; this function does not load it
+      paddle::platform::errors::PreconditionNotMet(
+          "Cannot load cudnn shared library. Cannot invoke method %s.",
+          fn_name));  // fn_name fills the %s in the message
+}
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cudnn.h b/paddle/pten/backends/dynload/cudnn.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e084dfe3a6b54cc5ce881fe6f3113ae6038a245
--- /dev/null
+++ b/paddle/pten/backends/dynload/cudnn.h
@@ -0,0 +1,199 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#ifdef PADDLE_WITH_CUDA
+#include <cudnn.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag cudnn_dso_flag;
+extern void* cudnn_dso_handle;
+extern bool HasCUDNN();
+
+extern void EnforceCUDNNLoaded(const char* fn_name);
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                      \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using cudnn_func = decltype(&::__name);                        \
+      std::call_once(cudnn_dso_flag, []() {                          \
+        cudnn_dso_handle = pten::dynload::GetCUDNNDsoHandle();       \
+      });                                                            \
+      EnforceCUDNNLoaded(#__name);                                   \
+      static void* p_##__name = dlsym(cudnn_dso_handle, #__name);    \
+      return reinterpret_cast<cudnn_func>(p_##__name)(args...);      \
+    }                                                                \
+  };                                                                 \
+  extern struct DynLoad__##__name __name
+
+/**
+ * Include all cuDNN functions needed in HPPL.
+ * Different cuDNN versions have different interfaces.
+ **/
+#define CUDNN_DNN_ROUTINE_EACH(__macro)                    \
+  __macro(cudnnSetTensor4dDescriptor);                     \
+  __macro(cudnnSetTensor4dDescriptorEx);                   \
+  __macro(cudnnSetTensorNdDescriptor);                     \
+  __macro(cudnnGetTensorNdDescriptor);                     \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);          \
+  __macro(cudnnCreateTensorDescriptor);                    \
+  __macro(cudnnDestroyTensorDescriptor);                   \
+  __macro(cudnnCreateFilterDescriptor);                    \
+  __macro(cudnnSetFilter4dDescriptor);                     \
+  __macro(cudnnSetFilterNdDescriptor);                     \
+  __macro(cudnnGetFilterNdDescriptor);                     \
+  __macro(cudnnSetPooling2dDescriptor);                    \
+  __macro(cudnnSetPoolingNdDescriptor);                    \
+  __macro(cudnnGetPoolingNdDescriptor);                    \
+  __macro(cudnnDestroyFilterDescriptor);                   \
+  __macro(cudnnCreateConvolutionDescriptor);               \
+  __macro(cudnnCreatePoolingDescriptor);                   \
+  __macro(cudnnDestroyPoolingDescriptor);                  \
+  __macro(cudnnSetConvolution2dDescriptor);                \
+  __macro(cudnnDestroyConvolutionDescriptor);              \
+  __macro(cudnnSetConvolutionNdDescriptor);                \
+  __macro(cudnnGetConvolutionNdDescriptor);                \
+  __macro(cudnnDeriveBNTensorDescriptor);                  \
+  __macro(cudnnCreateSpatialTransformerDescriptor);        \
+  __macro(cudnnSetSpatialTransformerNdDescriptor);         \
+  __macro(cudnnDestroySpatialTransformerDescriptor);       \
+  __macro(cudnnSpatialTfGridGeneratorForward);             \
+  __macro(cudnnSpatialTfGridGeneratorBackward);            \
+  __macro(cudnnSpatialTfSamplerForward);                   \
+  __macro(cudnnSpatialTfSamplerBackward);                  \
+  __macro(cudnnCreate);                                    \
+  __macro(cudnnDestroy);                                   \
+  __macro(cudnnSetStream);                                 \
+  __macro(cudnnActivationForward);                         \
+  __macro(cudnnActivationBackward);                        \
+  __macro(cudnnConvolutionForward);                        \
+  __macro(cudnnConvolutionBackwardBias);                   \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize);        \
+  __macro(cudnnTransformTensor);                           \
+  __macro(cudnnPoolingForward);                            \
+  __macro(cudnnPoolingBackward);                           \
+  __macro(cudnnSoftmaxBackward);                           \
+  __macro(cudnnSoftmaxForward);                            \
+  __macro(cudnnGetVersion);                                \
+  __macro(cudnnFindConvolutionForwardAlgorithmEx);         \
+  __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx);  \
+  __macro(cudnnFindConvolutionBackwardFilterAlgorithm);    \
+  __macro(cudnnFindConvolutionBackwardDataAlgorithmEx);    \
+  __macro(cudnnGetErrorString);                            \
+  __macro(cudnnCreateDropoutDescriptor);                   \
+  __macro(cudnnDropoutGetStatesSize);                      \
+  __macro(cudnnSetDropoutDescriptor);                      \
+  __macro(cudnnRestoreDropoutDescriptor);                  \
+  __macro(cudnnCreateRNNDescriptor);                       \
+  __macro(cudnnGetRNNParamsSize);                          \
+  __macro(cudnnGetRNNWorkspaceSize);                       \
+  __macro(cudnnGetRNNTrainingReserveSize);                 \
+  __macro(cudnnRNNForwardTraining);                        \
+  __macro(cudnnRNNBackwardData);                           \
+  __macro(cudnnRNNBackwardWeights);                        \
+  __macro(cudnnRNNForwardInference);                       \
+  __macro(cudnnDestroyDropoutDescriptor);                  \
+  __macro(cudnnDestroyRNNDescriptor);                      \
+  __macro(cudnnSetTensorNdDescriptorEx);                   \
+  __macro(cudnnAddTensor);                                 \
+  __macro(cudnnConvolutionBackwardData);                   \
+  __macro(cudnnConvolutionBackwardFilter);                 \
+  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
+  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize);   \
+  __macro(cudnnBatchNormalizationForwardTraining);         \
+  __macro(cudnnBatchNormalizationForwardInference);        \
+  __macro(cudnnBatchNormalizationBackward);                \
+  __macro(cudnnCreateActivationDescriptor);                \
+  __macro(cudnnSetActivationDescriptor);                   \
+  __macro(cudnnGetActivationDescriptor);                   \
+  __macro(cudnnDestroyActivationDescriptor);               \
+  __macro(cudnnSetRNNDescriptor_v6);
+CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+
+#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm);   \
+  __macro(cudnnGetConvolutionForwardAlgorithm);          \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm);     \
+  __macro(cudnnSetRNNDescriptor);
+CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
+#if CUDNN_VERSION >= 7001
+#define CUDNN_DNN_ROUTINE_EACH_R7(__macro)                \
+  __macro(cudnnSetConvolutionGroupCount);                 \
+  __macro(cudnnSetConvolutionMathType);                   \
+  __macro(cudnnConvolutionBiasActivationForward);         \
+  __macro(cudnnCreateCTCLossDescriptor);                  \
+  __macro(cudnnDestroyCTCLossDescriptor);                 \
+  __macro(cudnnGetCTCLossDescriptor);                     \
+  __macro(cudnnSetCTCLossDescriptor);                     \
+  __macro(cudnnGetCTCLossWorkspaceSize);                  \
+  __macro(cudnnCTCLoss);                                  \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7);   \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
+  __macro(cudnnGetConvolutionForwardAlgorithm_v7);        \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
+CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
+#if CUDNN_VERSION >= 7201
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
+  __macro(cudnnCreateRNNDataDescriptor);             \
+  __macro(cudnnDestroyRNNDataDescriptor);            \
+  __macro(cudnnSetRNNDataDescriptor);                \
+  __macro(cudnnSetRNNPaddingMode);                   \
+  __macro(cudnnRNNForwardTrainingEx);                \
+  __macro(cudnnRNNBackwardDataEx);                   \
+  __macro(cudnnRNNBackwardWeightsEx);                \
+  __macro(cudnnRNNForwardInferenceEx);
+CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
+#if CUDNN_VERSION >= 7401
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro)                     \
+  __macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
+  __macro(cudnnBatchNormalizationForwardTrainingEx);                 \
+  __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize);        \
+  __macro(cudnnBatchNormalizationBackwardEx);                        \
+  __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
+CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
+#if CUDNN_VERSION >= 8000
+#define CUDNN_DNN_ROUTINE_EACH_R8(__macro)            \
+  __macro(cudnnSetRNNDescriptor_v8);                  \
+  __macro(cudnnCreateFusedOpsPlan);                   \
+  __macro(cudnnCreateFusedOpsConstParamPack);         \
+  __macro(cudnnCreateFusedOpsVariantParamPack);       \
+  __macro(cudnnDestroyFusedOpsPlan);                  \
+  __macro(cudnnDestroyFusedOpsConstParamPack);        \
+  __macro(cudnnDestroyFusedOpsVariantParamPack);      \
+  __macro(cudnnFusedOpsExecute);                      \
+  __macro(cudnnSetFusedOpsConstParamPackAttribute);   \
+  __macro(cudnnSetFusedOpsVariantParamPackAttribute); \
+  __macro(cudnnMakeFusedOpsPlan);
+CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
+}  // namespace dynload
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/backends/dynload/cufft.cc b/paddle/pten/backends/dynload/cufft.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e146690e8dc0b4349bce4fb1277692a913e5c82
--- /dev/null
+++ b/paddle/pten/backends/dynload/cufft.cc
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/cufft.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+namespace dynload {
+std::once_flag cufft_dso_flag;
+void* cufft_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
+
+bool HasCUFFT() {
+  std::call_once(cufft_dso_flag,
+                 []() { cufft_dso_handle = GetCUFFTDsoHandle(); });
+  return cufft_dso_handle != nullptr;
+}
+
+void EnforceCUFFTLoaded(const char* fn_name) {
+  PADDLE_ENFORCE_NOT_NULL(
+      cufft_dso_handle,
+      paddle::platform::errors::PreconditionNotMet(
+          "Cannot load cufft shared library. Cannot invoke method %s.",
+          fn_name));
+}
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cufft.h b/paddle/pten/backends/dynload/cufft.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b0780b73160f6cb4d630e871885dfc4e59afee3
--- /dev/null
+++ b/paddle/pten/backends/dynload/cufft.h
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#ifdef PADDLE_WITH_CUDA
+#include <cufft.h>
+#include <cufftXt.h>
+#include <glog/logging.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag cufft_dso_flag;
+extern void* cufft_dso_handle;
+extern bool HasCUFFT();
+
+extern void EnforceCUFFTLoaded(const char* fn_name);
+#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name)                      \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using cufft_func = decltype(&::__name);                        \
+      std::call_once(cufft_dso_flag, []() {                          \
+        cufft_dso_handle = pten::dynload::GetCUFFTDsoHandle();       \
+      });                                                            \
+      EnforceCUFFTLoaded(#__name);                                   \
+      static void* p_##__name = dlsym(cufft_dso_handle, #__name);    \
+      return reinterpret_cast<cufft_func>(p_##__name)(args...);      \
+    }                                                                \
+  };                                                                 \
+  extern struct DynLoad__##__name __name
+
+/**
+ * Include all cuFFT functions needed in HPPL.
+ * Different cuFFT versions have different interfaces.
+ **/
+#define CUFFT_FFT_ROUTINE_EACH(__macro)  \
+  __macro(cufftPlan1d);                  \
+  __macro(cufftPlan2d);                  \
+  __macro(cufftPlan3d);                  \
+  __macro(cufftPlanMany);                \
+  __macro(cufftMakePlan1d);              \
+  __macro(cufftMakePlan2d);              \
+  __macro(cufftMakePlan3d);              \
+  __macro(cufftMakePlanMany);            \
+  __macro(cufftMakePlanMany64);          \
+  __macro(cufftGetSizeMany64);           \
+  __macro(cufftEstimate1d);              \
+  __macro(cufftEstimate2d);              \
+  __macro(cufftEstimate3d);              \
+  __macro(cufftEstimateMany);            \
+  __macro(cufftCreate);                  \
+  __macro(cufftGetSize1d);               \
+  __macro(cufftGetSize2d);               \
+  __macro(cufftGetSize3d);               \
+  __macro(cufftGetSizeMany);             \
+  __macro(cufftGetSize);                 \
+  __macro(cufftSetWorkArea);             \
+  __macro(cufftSetAutoAllocation);       \
+  __macro(cufftExecC2C);                 \
+  __macro(cufftExecR2C);                 \
+  __macro(cufftExecC2R);                 \
+  __macro(cufftExecZ2Z);                 \
+  __macro(cufftExecD2Z);                 \
+  __macro(cufftExecZ2D);                 \
+  __macro(cufftSetStream);               \
+  __macro(cufftDestroy);                 \
+  __macro(cufftGetVersion);              \
+  __macro(cufftGetProperty);             \
+  __macro(cufftXtSetGPUs);               \
+  __macro(cufftXtMalloc);                \
+  __macro(cufftXtMemcpy);                \
+  __macro(cufftXtFree);                  \
+  __macro(cufftXtSetWorkArea);           \
+  __macro(cufftXtExecDescriptorC2C);     \
+  __macro(cufftXtExecDescriptorR2C);     \
+  __macro(cufftXtExecDescriptorC2R);     \
+  __macro(cufftXtExecDescriptorZ2Z);     \
+  __macro(cufftXtExecDescriptorD2Z);     \
+  __macro(cufftXtExecDescriptorZ2D);     \
+  __macro(cufftXtQueryPlan);             \
+  __macro(cufftXtSetCallback);           \
+  __macro(cufftXtClearCallback);         \
+  __macro(cufftXtSetCallbackSharedSize); \
+  __macro(cufftXtMakePlanMany);          \
+  __macro(cufftXtGetSizeMany);           \
+  __macro(cufftXtExec);                  \
+  __macro(cufftXtExecDescriptor);        \
+  __macro(cufftXtSetWorkAreaPolicy);
+
+CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
+
+}  // namespace dynload
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/backends/dynload/cupti.cc b/paddle/pten/backends/dynload/cupti.cc
new file mode 100644
index 0000000000000000000000000000000000000000..91d202dbff410d3c76b078068966f9698dc4aa64
--- /dev/null
+++ b/paddle/pten/backends/dynload/cupti.cc
@@ -0,0 +1,32 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUPTI
+
+#include "paddle/pten/backends/dynload/cupti.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag cupti_dso_flag;
+void *cupti_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUPTI_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace pten
+
+#endif  // PADDLE_WITH_CUPTI
diff --git a/paddle/pten/backends/dynload/cupti.h b/paddle/pten/backends/dynload/cupti.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbc6993f458c291a0114f74f7a435dd1e611a59d
--- /dev/null
+++ b/paddle/pten/backends/dynload/cupti.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_CUPTI
+
+#include <cuda.h>
+#include <cupti.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag cupti_dso_flag;
+extern void *cupti_dso_handle;
+
+/**
+ * The following macro definition generates a struct
+ * (one per function) that dynamically loads a CUPTI routine
+ * via operator overloading.
+ *
+ * Note: dynamically linked libs by default.
+ */
+#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                   \
+  struct DynLoad__##__name {                                      \
+    template <typename... Args>                                   \
+    inline CUptiResult CUPTIAPI operator()(Args... args) {        \
+      using cuptiFunc = decltype(&::__name);                      \
+      std::call_once(cupti_dso_flag, []() {                       \
+        cupti_dso_handle = pten::dynload::GetCUPTIDsoHandle();    \
+      });                                                         \
+      static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
+      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);    \
+    }                                                             \
+  };                                                              \
+  extern DynLoad__##__name __name
+
+#define CUPTI_ROUTINE_EACH(__macro)           \
+  __macro(cuptiActivityEnable);               \
+  __macro(cuptiActivityDisable);              \
+  __macro(cuptiActivityRegisterCallbacks);    \
+  __macro(cuptiActivityGetAttribute);         \
+  __macro(cuptiActivitySetAttribute);         \
+  __macro(cuptiGetTimestamp);                 \
+  __macro(cuptiActivityGetNextRecord);        \
+  __macro(cuptiGetResultString);              \
+  __macro(cuptiActivityGetNumDroppedRecords); \
+  __macro(cuptiActivityFlushAll);             \
+  __macro(cuptiSubscribe);                    \
+  __macro(cuptiUnsubscribe);                  \
+  __macro(cuptiEnableCallback);               \
+  __macro(cuptiEnableDomain);
+
+CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
+}  // namespace dynload
+}  // namespace pten
+
+#endif  // PADDLE_WITH_CUPTI
diff --git a/paddle/pten/backends/dynload/cupti_lib_path.h.in b/paddle/pten/backends/dynload/cupti_lib_path.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..017384bfbb7eb6db3927894f652c11ddb07cebc5
--- /dev/null
+++ b/paddle/pten/backends/dynload/cupti_lib_path.h.in
@@ -0,0 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@"
diff --git a/paddle/pten/backends/dynload/curand.cc b/paddle/pten/backends/dynload/curand.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7a3218f44caa0e75dc9d0b5b0d0f39154a447aa9
--- /dev/null
+++ b/paddle/pten/backends/dynload/curand.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/curand.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag curand_dso_flag;
+void *curand_dso_handle = nullptr;  // loaded lazily via curand_dso_flag
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/curand.h b/paddle/pten/backends/dynload/curand.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ab8d179c37986ee4c9132d35a9529a4eefc8187
--- /dev/null
+++ b/paddle/pten/backends/dynload/curand.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <curand.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+extern std::once_flag curand_dso_flag;
+extern void *curand_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                   \
+  struct DynLoad__##__name {                                       \
+    template <typename... Args>                                    \
+    curandStatus_t operator()(Args... args) {                      \
+      using curandFunc = decltype(&::__name);                      \
+      std::call_once(curand_dso_flag, []() {                       \
+        curand_dso_handle = pten::dynload::GetCurandDsoHandle();   \
+      });                                                          \
+      static void *p_##__name = dlsym(curand_dso_handle, #__name); \
+      return reinterpret_cast<curandFunc>(p_##__name)(args...);    \
+    }                                                              \
+  };                                                               \
+  extern DynLoad__##__name __name
+
+#define CURAND_RAND_ROUTINE_EACH(__macro)      \
+  __macro(curandCreateGenerator);              \
+  __macro(curandSetStream);                    \
+  __macro(curandSetPseudoRandomGeneratorSeed); \
+  __macro(curandGenerateUniform);              \
+  __macro(curandGenerateUniformDouble);        \
+  __macro(curandGenerateNormal);               \
+  __macro(curandDestroyGenerator);
+
+CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cusolver.cc b/paddle/pten/backends/dynload/cusolver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..581aaabd8ae98b9fb055f9f52812a72a1cf5e401
--- /dev/null
+++ b/paddle/pten/backends/dynload/cusolver.cc
@@ -0,0 +1,36 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/cusolver.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag cusolver_dso_flag;
+void *cusolver_dso_handle = nullptr;  // loaded lazily via cusolver_dso_flag
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUSOLVER_ROUTINE_EACH(DEFINE_WRAP);
+
+#ifdef CUSOLVER_ROUTINE_EACH_R1
+CUSOLVER_ROUTINE_EACH_R1(DEFINE_WRAP);
+#endif
+
+#ifdef CUSOLVER_ROUTINE_EACH_R2
+CUSOLVER_ROUTINE_EACH_R2(DEFINE_WRAP);
+#endif
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cusolver.h b/paddle/pten/backends/dynload/cusolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..f352686a009c83b8a757b7e4d19f469ec1377ba0
--- /dev/null
+++ b/paddle/pten/backends/dynload/cusolver.h
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <cuda.h>
+#include <cusolverDn.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+extern std::once_flag cusolver_dso_flag;
+extern void *cusolver_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name)                   \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    cusolverStatus_t operator()(Args... args) {                      \
+      using cusolverFunc = decltype(&::__name);                      \
+      std::call_once(cusolver_dso_flag, []() {                       \
+        cusolver_dso_handle = pten::dynload::GetCusolverDsoHandle(); \
+      });                                                            \
+      static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \
+      return reinterpret_cast<cusolverFunc>(p_##__name)(args...);    \
+    }                                                                \
+  };                                                                 \
+  extern DynLoad__##__name __name
+
+#define CUSOLVER_ROUTINE_EACH(__macro)  \
+  __macro(cusolverDnCreate);            \
+  __macro(cusolverDnDestroy);           \
+  __macro(cusolverDnSetStream);         \
+  __macro(cusolverDnSpotrf_bufferSize); \
+  __macro(cusolverDnDpotrf_bufferSize); \
+  __macro(cusolverDnSpotrf);            \
+  __macro(cusolverDnDpotrf);            \
+  __macro(cusolverDnSpotrs);            \
+  __macro(cusolverDnDpotrs);            \
+  __macro(cusolverDnCpotrs);            \
+  __macro(cusolverDnZpotrs);            \
+  __macro(cusolverDnSsyevd_bufferSize); \
+  __macro(cusolverDnDsyevd_bufferSize); \
+  __macro(cusolverDnCheevd_bufferSize); \
+  __macro(cusolverDnZheevd_bufferSize); \
+  __macro(cusolverDnSsyevd);            \
+  __macro(cusolverDnDsyevd);            \
+  __macro(cusolverDnCheevd);            \
+  __macro(cusolverDnZheevd);
+
+CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
+
+#if CUDA_VERSION >= 9020
+#define CUSOLVER_ROUTINE_EACH_R1(__macro) \
+  __macro(cusolverDnSpotrfBatched);       \
+  __macro(cusolverDnDpotrfBatched);       \
+  __macro(cusolverDnSpotrsBatched);       \
+  __macro(cusolverDnDpotrsBatched);       \
+  __macro(cusolverDnSgesvdj_bufferSize);  \
+  __macro(cusolverDnSgetrf_bufferSize);   \
+  __macro(cusolverDnDgetrf_bufferSize);   \
+  __macro(cusolverDnCgetrf_bufferSize);   \
+  __macro(cusolverDnZgetrf_bufferSize);   \
+  __macro(cusolverDnSgeqrf_bufferSize);   \
+  __macro(cusolverDnDgeqrf_bufferSize);   \
+  __macro(cusolverDnCgeqrf_bufferSize);   \
+  __macro(cusolverDnZgeqrf_bufferSize);   \
+  __macro(cusolverDnSorgqr_bufferSize);   \
+  __macro(cusolverDnDorgqr_bufferSize);   \
+  __macro(cusolverDnSormqr_bufferSize);   \
+  __macro(cusolverDnDormqr_bufferSize);   \
+  __macro(cusolverDnCungqr_bufferSize);   \
+  __macro(cusolverDnZungqr_bufferSize);   \
+  __macro(cusolverDnDestroyGesvdjInfo);   \
+  __macro(cusolverDnCreateGesvdjInfo);    \
+  __macro(cusolverDnDgesvdj_bufferSize);  \
+  __macro(cusolverDnSgesvdj);             \
+  __macro(cusolverDnDgesvdj);             \
+  __macro(cusolverDnSgetrf);              \
+  __macro(cusolverDnDgetrf);              \
+  __macro(cusolverDnCgetrf);              \
+  __macro(cusolverDnZgetrf);              \
+  __macro(cusolverDnSgeqrf);              \
+  __macro(cusolverDnDgeqrf);              \
+  __macro(cusolverDnCgeqrf);              \
+  __macro(cusolverDnZgeqrf);              \
+  __macro(cusolverDnSorgqr);              \
+  __macro(cusolverDnDorgqr);              \
+  __macro(cusolverDnSormqr);              \
+  __macro(cusolverDnDormqr);              \
+  __macro(cusolverDnCungqr);              \
+  __macro(cusolverDnZungqr);
+
+CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
+#endif
+
+#if CUDA_VERSION >= 9020
+#define CUSOLVER_ROUTINE_EACH_R2(__macro) \
+  __macro(cusolverDnCreateSyevjInfo);     \
+  __macro(cusolverDnSsyevj_bufferSize);   \
+  __macro(cusolverDnDsyevj_bufferSize);   \
+  __macro(cusolverDnSsyevj);              \
+  __macro(cusolverDnDsyevj);              \
+  __macro(cusolverDnDestroySyevjInfo);
+
+CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
+#endif
+
+#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cusparse.cc b/paddle/pten/backends/dynload/cusparse.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4277f14149f0c4e506df83144619fcbb60160785
--- /dev/null
+++ b/paddle/pten/backends/dynload/cusparse.cc
@@ -0,0 +1,37 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/cusparse.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag cusparse_dso_flag;
+void *cusparse_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+#ifdef CUSPARSE_ROUTINE_EACH
+CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
+#endif
+
+#ifdef CUSPARSE_ROUTINE_EACH_R2
+CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
+#endif
+
+#ifdef CUSPARSE_ROUTINE_EACH_11020
+CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
+#endif
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/cusparse.h b/paddle/pten/backends/dynload/cusparse.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9a7fab87799e411026b7c9363563ff2a4fa836b
--- /dev/null
+++ b/paddle/pten/backends/dynload/cusparse.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <cuda.h>
+#include <cusparse.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+extern std::once_flag cusparse_dso_flag;
+extern void *cusparse_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name)                   \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    cusparseStatus_t operator()(Args... args) {                      \
+      using cusparseFunc = decltype(&::__name);                      \
+      std::call_once(cusparse_dso_flag, []() {                       \
+        cusparse_dso_handle = pten::dynload::GetCusparseDsoHandle(); \
+      });                                                            \
+      static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
+      return reinterpret_cast<cusparseFunc>(p_##__name)(args...);    \
+    }                                                                \
+  };                                                                 \
+  extern DynLoad__##__name __name
+
+#if defined(PADDLE_WITH_CUDA)
+// The generic APIs are supported from CUDA 10.1
+#if CUDA_VERSION >= 10010
+#define CUSPARSE_ROUTINE_EACH(__macro) \
+  __macro(cusparseCreate);             \
+  __macro(cusparseSetStream);          \
+  __macro(cusparseCreateMatDescr);     \
+  __macro(cusparseDestroy);            \
+  __macro(cusparseSnnz);               \
+  __macro(cusparseDnnz);               \
+  __macro(cusparseSetMatType);         \
+  __macro(cusparseSetMatIndexBase);
+
+CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
+
+// APIs available after CUDA 11.2
+#if CUDA_VERSION >= 11020
+#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
+  __macro(cusparseCreateCsr);                \
+  __macro(cusparseCreateCoo);                \
+  __macro(cusparseCreateDnMat);              \
+  __macro(cusparseSpMM_bufferSize);          \
+  __macro(cusparseSpMM);                     \
+  __macro(cusparseDestroySpMat);             \
+  __macro(cusparseDestroyDnMat);             \
+  __macro(cusparseCooSetPointers);           \
+  __macro(cusparseCsrSetPointers);           \
+  __macro(cusparseDenseToSparse_bufferSize); \
+  __macro(cusparseDenseToSparse_analysis);   \
+  __macro(cusparseDenseToSparse_convert);    \
+  __macro(cusparseSparseToDense_bufferSize); \
+  __macro(cusparseSparseToDense);
+
+CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
+
+// APIs available after CUDA 11.3
+#if CUDA_VERSION >= 11030
+#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
+  __macro(cusparseSDDMM_bufferSize);      \
+  __macro(cusparseSDDMM_preprocess);      \
+  __macro(cusparseSDDMM);
+
+CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
+#endif
+#endif
+#endif
+#endif
+
+#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/dynamic_loader.cc b/paddle/pten/backends/dynload/dynamic_loader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2817b1520e215b0feb18b39a55fba500421fe753
--- /dev/null
+++ b/paddle/pten/backends/dynload/dynamic_loader.cc
@@ -0,0 +1,585 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/backends/dynload/cupti_lib_path.h"
+
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+// TODO(wilber): The pten computing library requires a component to manage flags
+// (maybe not use gflags).
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+DEFINE_string(cudnn_dir,
+              "",
+              "Specify path for loading libcudnn.so. For instance, "
+              "/usr/local/cudnn/lib. If empty [default], dlopen "
+              "will search cudnn from LD_LIBRARY_PATH");
+
+DEFINE_string(
+    cuda_dir,
+    "",
+    "Specify path for loading cuda library, such as libcublas, libcublasLt, "
+    "libcurand, libcusolver. For instance, /usr/local/cuda/lib64. "
+    "If default, dlopen will search cuda from LD_LIBRARY_PATH");
+
+DEFINE_string(nccl_dir,
+              "",
+              "Specify path for loading nccl library, such as libnccl.so. "
+              "For instance, /usr/local/cuda/lib64. If default, "
+              "dlopen will search nccl from LD_LIBRARY_PATH");
+
+DEFINE_string(hccl_dir,
+              "",
+              "Specify path for loading hccl library, such as libhccl.so. "
+              "For instance, "
+              "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. If "
+              "default, "
+              "dlopen will search hccl from LD_LIBRARY_PATH");
+
+DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
+
+DEFINE_string(
+    tensorrt_dir,
+    "",
+    "Specify path for loading tensorrt library, such as libnvinfer.so.");
+
+DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
+
+DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
+
+DEFINE_string(mkl_dir,
+              "",
+              "Specify path for loading libmkl_rt.so. "
+              "For instance, /opt/intel/oneapi/mkl/latest/lib/intel64/. "
+              "If default, "
+              "dlopen will search mkl from LD_LIBRARY_PATH");
+
+DEFINE_string(op_dir, "", "Specify path for loading user-defined op library.");
+
+#ifdef PADDLE_WITH_HIP
+
+DEFINE_string(miopen_dir,
+              "",
+              "Specify path for loading libMIOpen.so. For instance, "
+              "/opt/rocm/miopen/lib. If empty [default], dlopen "
+              "will search miopen from LD_LIBRARY_PATH");
+
+DEFINE_string(rocm_dir,
+              "",
+              "Specify path for loading rocm library, such as librocblas, "
+              "libmiopen, libhipsparse. For instance, /opt/rocm/lib. "
+              "If default, dlopen will search rocm from LD_LIBRARY_PATH");
+
+DEFINE_string(rccl_dir,
+              "",
+              "Specify path for loading rccl library, such as librccl.so. "
+              "For instance, /opt/rocm/rccl/lib. If default, "
+              "dlopen will search rccl from LD_LIBRARY_PATH");
+#endif
+
+namespace pten {
+namespace dynload {
+
+struct PathNode {
+  PathNode() {}
+  std::string path = "";
+};
+
+static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
+
+// NOTE: In order to adapt to the default installation path of cuda
+#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin";
+#else
+static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64";
+#endif
+
+static PathNode s_py_site_pkg_path;
+
+#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+static constexpr char win_cudnn_lib[] = "cudnn64_" CUDNN_MAJOR_VERSION ".dll";
+static constexpr char win_cublas_lib[] =
+    "cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cublas64_" CUDA_VERSION_MAJOR ".dll";
+#if CUDA_VERSION >= 11000
+static constexpr char win_curand_lib[] =
+    "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll";
+static constexpr char win_nvjpeg_lib[] =
+    "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll";
+static constexpr char win_cusolver_lib[] =
+    "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll";
+static constexpr char win_cusparse_lib[] =
+    "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll";
+static constexpr char win_cufft_lib[] =
+    "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_10.dll";
+#else
+static constexpr char win_curand_lib[] =
+    "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;curand64_" CUDA_VERSION_MAJOR ".dll";
+static constexpr char win_nvjpeg_lib[] =
+    "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll";
+static constexpr char win_cusolver_lib[] =
+    "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll";
+static constexpr char win_cusparse_lib[] =
+    "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll";
+static constexpr char win_cufft_lib[] =
+    "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll";
+#endif  // CUDA_VERSION
+#endif  // _WIN32 && PADDLE_WITH_CUDA
+
+static inline std::string join(const std::string& part1,
+                               const std::string& part2) {
+  // directory separator
+  const char sep = '/';
+  if (!part2.empty() && part2.front() == sep) {
+    return part2;
+  }
+  std::string ret;
+  ret.reserve(part1.size() + part2.size() + 1);
+  ret = part1;
+  if (!ret.empty() && ret.back() != sep) {
+    ret += sep;
+  }
+  ret += part2;
+  return ret;
+}
+
+// Splits `str` on any character of `separator` and returns the non-empty
+// tokens; an empty or separator-only `str` yields an empty vector.
+static inline std::vector<std::string> split(
+    const std::string& str, const std::string& separator = " ") {
+  std::vector<std::string> str_list;
+  std::string::size_type firstPos = str.find_first_not_of(separator, 0);
+  // Loop on firstPos only: calling substr with firstPos == npos (empty input
+  // or trailing separator) throws std::out_of_range.
+  while (std::string::npos != firstPos) {
+    const std::string::size_type lastPos =
+        str.find_first_of(separator, firstPos);
+    // substr clamps its count, so lastPos == npos takes the rest of `str`.
+    str_list.push_back(str.substr(firstPos, lastPos - firstPos));
+    firstPos = str.find_first_not_of(separator, lastPos);
+  }
+  return str_list;
+}
+
+void SetPaddleLibPath(const std::string& py_site_pkg_path) {
+  s_py_site_pkg_path.path = py_site_pkg_path;
+  VLOG(3) << "Set paddle lib path : " << py_site_pkg_path;
+}
+
+static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
+                                                 const std::string& dso_name,
+                                                 int dynload_flags) {
+  void* dso_handle = nullptr;
+  if (!spec_path.empty()) {
+    // search xxx.so from custom path
+    VLOG(3) << "Try to find library: " << dso_name
+            << " from specific path: " << spec_path;
+    std::string dso_path = join(spec_path, dso_name);
+    dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+  }
+  return dso_handle;
+}
+
+static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
+                                                int dynload_flags) {
+  // Hand the bare name to dlopen so it is resolved by the default search
+  // (LD_LIBRARY_PATH/DYLD_LIBRARY_PATH and /usr/local/lib path).
+  void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+  VLOG(3) << "Try to find library: " << dso_path
+          << " from default system path.";
+
+// TODO(chenweihang): This path is used to search which libs?
+// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11, which introduced
+// System Integrity Protection (SIP); if dso_handle is null, retry from
+// the default CUDA package path on Mac OS.
+#if defined(__APPLE__) || defined(__OSX__)
+  if (nullptr == dso_handle) {
+    dso_handle =
+        dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
+  }
+#endif
+
+  return dso_handle;
+}
+
+/*
+ * Loads the first library in `dso_name` (a ';'-separated candidate list)
+ * that can be opened. Each candidate is searched with three priorities:
+ *
+ * First: the path specified by the user (`config_path`)
+ * Second: the system default path
+ * Third: the library-specific paths in `extra_paths`, in order
+ */
+
+static inline void* GetDsoHandleFromSearchPath(
+    const std::string& config_path,
+    const std::string& dso_name,
+    bool throw_on_error = true,
+    const std::vector<std::string>& extra_paths = std::vector<std::string>(),
+    const std::string& warning_msg = std::string()) {
+#if !defined(_WIN32)
+  int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
+#else
+  int dynload_flags = 0;
+#endif  // !_WIN32
+  std::vector<std::string> dso_names = split(dso_name, ";");
+  void* dso_handle = nullptr;
+  for (const auto& dso : dso_names) {
+    // 1. search in user config path by FLAGS
+    dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags);
+    // 2. search in system default path
+    if (nullptr == dso_handle) {
+      dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags);
+    }
+    // 3. search in extra paths, keeping the first handle that loads
+    for (const auto& path : extra_paths) {
+      if (nullptr != dso_handle) break;
+      VLOG(3) << "extra_paths: " << path;
+      dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags);
+    }
+    if (nullptr != dso_handle) break;
+  }
+
+  // 4. [If Failed for All dso_names] logging warning if exists
+  if (nullptr == dso_handle && !warning_msg.empty()) {
+    LOG(WARNING) << warning_msg;
+  }
+
+  // 5. [If Failed for All dso_names] logging or throw error info
+  if (nullptr == dso_handle) {
+    auto error_msg =
+        "The third-party dynamic library (%s) that Paddle depends on is not "
+        "configured correctly. (error code is %s)\n"
+        "  Suggestions:\n"
+        "  1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) "
+        "is installed correctly and its version is matched with paddlepaddle "
+        "you installed.\n"
+        "  2. Configure third-party dynamic library environment variables as "
+        "follows:\n"
+        "  - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
+        "  - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
+        "  - Mac: set  DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` "
+        "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
+        "impossible unless System Integrity Protection (SIP) is disabled.]";
+#if !defined(_WIN32)
+    const char* dl_err = dlerror();  // null if no dl error is pending
+    auto errorno = dl_err ? dl_err : "unknown";
+#else
+    auto errorno = GetLastError();
+#endif  // !_WIN32
+    if (throw_on_error) {
+      // NOTE: Special error report case, no need to change its format
+      PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+          error_msg, dso_name, errorno));
+    } else {
+      LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno);
+    }
+  }
+
+  return dso_handle;
+}
+
+void* GetCublasDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
+#endif
+}
+
+void* GetCublasLtDsoHandle() {
+// APIs available after CUDA 10.1 (CUDA_VERSION 10010)
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10010
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
+#else
+  // Surface the reason instead of silently handing back a null handle.
+  LOG(WARNING) << "Your CUDA_VERSION less 10.1, not support CublasLt. "
+                  "If you want to use CublasLt, please upgrade CUDA and "
+                  "rebuild PaddlePaddle.";
+  return nullptr;
+#endif
+}
+
+void* GetCUDNNDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  std::string mac_warn_meg(
+      "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
+      "For instance, sudo tar -xzf "
+      "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
+      "chmod a+r /usr/local/cuda/include/cudnn.h "
+      "/usr/local/cuda/lib/libcudnn*");
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cudnn_dir, "libcudnn.dylib", false, {}, mac_warn_meg);
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  std::string win_warn_meg(
+      "Note: [Recommend] copy cudnn into CUDA installation directory. \n "
+      "For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from "
+      "NVIDIA's official website, \n"
+      "then, unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing "
+      "Toolkit\\CUDA\\v10.0\n"
+      "You should do this according to your CUDA installation directory and "
+      "CUDNN version.");
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg);
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
+#else
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path});
+#endif
+}
+
+void* GetCUPTIDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path});
+#else
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
+#endif
+}
+
+void* GetCurandDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path});
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
+#endif
+}
+
+#ifdef PADDLE_WITH_HIP
+void* GetROCFFTDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so");
+#endif
+}
+#endif
+
+void* GetNvjpegDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib");
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, win_nvjpeg_lib, true, {cuda_lib_path});
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so");
+#endif
+}
+
+void* GetCusolverDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib");
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path});
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
+#endif
+}
+
+void* GetCusparseDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib");
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so");
+#endif
+}
+
+void* GetNVRTCDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
+#endif
+}
+
+void* GetCUDADsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
+#elif defined(_WIN32)
+  char system32_dir[MAX_PATH];
+  GetSystemDirectory(system32_dir, MAX_PATH);
+  return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
+#endif
+}
+
+void* GetWarpCTCDsoHandle() {
+  std::string warpctc_dir = "";
+  if (!s_py_site_pkg_path.path.empty()) {
+    warpctc_dir = s_py_site_pkg_path.path;
+  }
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll");
+#else
+  return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so");
+#endif
+}
+
+void* GetNCCLDsoHandle() {
+#ifdef PADDLE_WITH_HIP
+  std::string warning_msg(
+      "You may need to install 'rccl' from ROCM official website: "
+      "https://rocmdocs.amd.com/en/latest/Installation_Guide/"
+      "Installation-Guide.html before install PaddlePaddle.");
+#else
+  std::string warning_msg(
+      "You may need to install 'nccl2' from NVIDIA official website: "
+      "https://developer.nvidia.com/nccl/nccl-download "
+      "before install PaddlePaddle.");
+#endif
+
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg);
+#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg);
+#else
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg);
+#endif
+}
+void* GetHCCLDsoHandle() {
+  std::string warning_msg(
+      "You may need to install 'hccl2' from Huawei official website: "
+      "before install PaddlePaddle.");
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg);
+#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
+  return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
+
+#elif defined(PADDLE_WITH_ASCEND_CL)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_hccl_dir, "libhccl.so", true, {}, warning_msg);
+#else
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg);
+#endif
+}
+
+void* GetTensorRtDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
+#elif defined(_WIN32)  // honors FLAGS_tensorrt_dir like every other platform
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "nvinfer.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
+#endif
+}
+
+void* GetMKLMLDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
+#endif
+}
+
+void* GetLAPACKDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3");
+#endif
+}
+
+void* GetOpDsoHandle(const std::string& dso_name) {
+  return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
+}
+
+void* GetNvtxDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  PADDLE_THROW(
+      paddle::platform::errors::Unimplemented("Nvtx do not support Apple."));
+#elif defined(_WIN32)
+  PADDLE_THROW(
+      paddle::platform::errors::Unimplemented("Nvtx do not support Windows."));
+#elif !defined(PADDLE_WITH_CUDA)
+  PADDLE_THROW(paddle::platform::errors::Unimplemented(
+      "Nvtx do not support without CUDA."));
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
+#endif
+}
+
+void* GetCUFFTDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib");
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so");
+#endif
+}
+
+void* GetMKLRTDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so");
+#endif
+}
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/dynamic_loader.h b/paddle/pten/backends/dynload/dynamic_loader.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7c7a87d33d6742374b6bab916df29ca52bab7dc
--- /dev/null
+++ b/paddle/pten/backends/dynload/dynamic_loader.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+
+namespace pten {
+namespace dynload {
+
+#ifndef _WIN32
+#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
+#else
+#define DECLARE_TYPE(__name, ...) decltype(auto)
+#endif
+
+void* GetCublasDsoHandle();
+void* GetCublasLtDsoHandle();
+void* GetCUDNNDsoHandle();
+void* GetCUPTIDsoHandle();
+void* GetCurandDsoHandle();
+void* GetNvjpegDsoHandle();
+void* GetCusolverDsoHandle();
+void* GetCusparseDsoHandle();
+void* GetNVRTCDsoHandle();
+void* GetCUDADsoHandle();
+void* GetWarpCTCDsoHandle();
+void* GetNCCLDsoHandle();
+void* GetHCCLDsoHandle();
+void* GetTensorRtDsoHandle();
+void* GetMKLMLDsoHandle();
+void* GetLAPACKDsoHandle();
+void* GetOpDsoHandle(const std::string& dso_name);
+void* GetNvtxDsoHandle();
+void* GetCUFFTDsoHandle();
+void* GetMKLRTDsoHandle();
+void* GetROCFFTDsoHandle();
+
+void SetPaddleLibPath(const std::string&);
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/hipfft.cc b/paddle/pten/backends/dynload/hipfft.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a1d802fac045986e6973a0d3fa09d001d9cb9ff7
--- /dev/null
+++ b/paddle/pten/backends/dynload/hipfft.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/hipfft.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag hipfft_dso_flag;
+void *hipfft_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/hipfft.h b/paddle/pten/backends/dynload/hipfft.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6f4e6ca8ceb0512e00a134e77dd8c4e0b3659e7
--- /dev/null
+++ b/paddle/pten/backends/dynload/hipfft.h
@@ -0,0 +1,122 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#ifdef PADDLE_WITH_HIP
+#include <hipfft.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+extern std::once_flag hipfft_dso_flag;
+extern void *hipfft_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name)                     \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using hipfftFunc = decltype(&::__name);                        \
+      std::call_once(hipfft_dso_flag, []() {                         \
+        hipfft_dso_handle = pten::dynload::GetROCFFTDsoHandle();     \
+      });                                                            \
+      static void *p_##__name = dlsym(hipfft_dso_handle, #__name);   \
+      return reinterpret_cast<hipfftFunc>(p_##__name)(args...);      \
+    }                                                                \
+  };                                                                 \
+  extern DynLoad__##__name __name
+
+#define HIPFFT_FFT_ROUTINE_EACH(__macro) \
+  __macro(hipfftPlan1d);                 \
+  __macro(hipfftPlan2d);                 \
+  __macro(hipfftPlan3d);                 \
+  __macro(hipfftPlanMany);               \
+  __macro(hipfftMakePlan1d);             \
+  __macro(hipfftMakePlanMany);           \
+  __macro(hipfftMakePlanMany64);         \
+  __macro(hipfftGetSizeMany64);          \
+  __macro(hipfftEstimate1d);             \
+  __macro(hipfftEstimate2d);             \
+  __macro(hipfftEstimate3d);             \
+  __macro(hipfftEstimateMany);           \
+  __macro(hipfftCreate);                 \
+  __macro(hipfftGetSize1d);              \
+  __macro(hipfftGetSizeMany);            \
+  __macro(hipfftGetSize);                \
+  __macro(hipfftSetWorkArea);            \
+  __macro(hipfftSetAutoAllocation);      \
+  __macro(hipfftExecC2C);                \
+  __macro(hipfftExecR2C);                \
+  __macro(hipfftExecC2R);                \
+  __macro(hipfftExecZ2Z);                \
+  __macro(hipfftExecD2Z);                \
+  __macro(hipfftExecZ2D);                \
+  __macro(hipfftSetStream);              \
+  __macro(hipfftDestroy);                \
+  __macro(hipfftGetVersion);             \
+  __macro(hipfftGetProperty);
+
+HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
+
+inline const char *hipfftGetErrorString(hipfftResult_t status) {
+  switch (status) {  // Map each known hipFFT status code to a description.
+    case HIPFFT_SUCCESS:
+      return "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
+    case HIPFFT_INVALID_PLAN:
+      return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
+    case HIPFFT_ALLOC_FAILED:
+      return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
+             "memory.";
+    case HIPFFT_INVALID_TYPE:
+      return "'HIPFFT_INVALID_TYPE'. No longer used.";
+    case HIPFFT_INVALID_VALUE:
+      return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
+             "parameter.";
+    case HIPFFT_INTERNAL_ERROR:
+      return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
+             "error.";
+    case HIPFFT_EXEC_FAILED:
+      return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
+    case HIPFFT_SETUP_FAILED:
+      return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
+    case HIPFFT_INVALID_SIZE:
+      return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
+    case HIPFFT_UNALIGNED_DATA:
+      return "'HIPFFT_UNALIGNED_DATA'. No longer used.";
+    case HIPFFT_INCOMPLETE_PARAMETER_LIST:
+      return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
+    case HIPFFT_INVALID_DEVICE:
+      return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
+             "GPU than plan creation.";
+    case HIPFFT_PARSE_ERROR:
+      return "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
+    case HIPFFT_NO_WORKSPACE:
+      return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
+             "plan execution.";
+    case HIPFFT_NOT_IMPLEMENTED:
+      return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
+             "functionality for parameters given.";
+    case HIPFFT_NOT_SUPPORTED:
+      return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
+             "parameters given.";
+    default:  // Any status value not listed above.
+      return "HIPFFT_STATUS_UNKNOWN_ERROR";
+  }
+}
+}  // namespace dynload
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/backends/dynload/hiprand.cc b/paddle/pten/backends/dynload/hiprand.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7a5acaa112f562df4c4760028ec7605fea73c16
--- /dev/null
+++ b/paddle/pten/backends/dynload/hiprand.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/hiprand.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag hiprand_dso_flag;
+void *hiprand_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/hiprand.h b/paddle/pten/backends/dynload/hiprand.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0c16da3dbc2d17143950ba1a6d662f3af396992
--- /dev/null
+++ b/paddle/pten/backends/dynload/hiprand.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <hiprand.h>
+
+#include <mutex>  // NOLINT
+#include "paddle/pten/backends/dynload/port.h"
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+
+namespace pten {
+namespace dynload {
+extern std::once_flag hiprand_dso_flag;
+extern void *hiprand_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                    \
+  struct DynLoad__##__name {                                        \
+    template <typename... Args>                                     \
+    hiprandStatus_t operator()(Args... args) {                      \
+      using hiprandFunc = decltype(&::__name);                      \
+      std::call_once(hiprand_dso_flag, []() {                       \
+        hiprand_dso_handle = pten::dynload::GetCurandDsoHandle();   \
+      });                                                           \
+      static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \
+      return reinterpret_cast<hiprandFunc>(p_##__name)(args...);    \
+    }                                                               \
+  };                                                                \
+  extern DynLoad__##__name __name
+
+#define HIPRAND_RAND_ROUTINE_EACH(__macro)      \
+  __macro(hiprandCreateGenerator);              \
+  __macro(hiprandSetStream);                    \
+  __macro(hiprandSetPseudoRandomGeneratorSeed); \
+  __macro(hiprandGenerateUniform);              \
+  __macro(hiprandGenerateUniformDouble);        \
+  __macro(hiprandGenerateNormal);               \
+  __macro(hiprandDestroyGenerator);
+
+HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/hiprtc.cc b/paddle/pten/backends/dynload/hiprtc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7ae1e2ab10f186c715718b72e8a3549e51755058
--- /dev/null
+++ b/paddle/pten/backends/dynload/hiprtc.cc
@@ -0,0 +1,34 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/hiprtc.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag hiprtc_dso_flag;
+void* hiprtc_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+HIPRTC_ROUTINE_EACH(DEFINE_WRAP);
+
+bool HasNVRTC() {  // True when the cached hiprtc DSO handle is non-null.
+  const auto load_handle = []() { hiprtc_dso_handle = GetNVRTCDsoHandle(); };
+  std::call_once(hiprtc_dso_flag, load_handle);
+  return nullptr != hiprtc_dso_handle;
+}
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/hiprtc.h b/paddle/pten/backends/dynload/hiprtc.h
new file mode 100644
index 0000000000000000000000000000000000000000..76c1753e981e1da7e9a66fe31026666edf9e5696
--- /dev/null
+++ b/paddle/pten/backends/dynload/hiprtc.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <hip/hiprtc.h>
+#include <mutex>  // NOLINT
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag hiprtc_dso_flag;
+extern void* hiprtc_dso_handle;
+extern bool HasNVRTC();
+
+#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name)                     \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using hiprtc_func = decltype(&::__name);                       \
+      std::call_once(hiprtc_dso_flag, []() {                         \
+        hiprtc_dso_handle = pten::dynload::GetNVRTCDsoHandle();      \
+      });                                                            \
+      static void* p_##__name = dlsym(hiprtc_dso_handle, #__name);   \
+      return reinterpret_cast<hiprtc_func>(p_##__name)(args...);     \
+    }                                                                \
+  };                                                                 \
+  extern struct DynLoad__##__name __name
+
+/**
+ * List of every hiprtc function that gets a dynamic-load wrapper.
+ **/
+#define HIPRTC_ROUTINE_EACH(__macro) \
+  __macro(hiprtcVersion);            \
+  __macro(hiprtcGetErrorString);     \
+  __macro(hiprtcCompileProgram);     \
+  __macro(hiprtcCreateProgram);      \
+  __macro(hiprtcDestroyProgram);     \
+  __macro(hiprtcGetCode);            \
+  __macro(hiprtcGetCodeSize);        \
+  __macro(hiprtcGetProgramLog);      \
+  __macro(hiprtcGetProgramLogSize)
+
+HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/lapack.cc b/paddle/pten/backends/dynload/lapack.cc
new file mode 100644
index 0000000000000000000000000000000000000000..847f4528dae589d382e88582d24ddcbdeaaafe69
--- /dev/null
+++ b/paddle/pten/backends/dynload/lapack.cc
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/lapack.h"
+#include <mutex>
+
+namespace pten {
+namespace dynload {
+
+std::once_flag lapack_dso_flag;
+void* lapack_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+LAPACK_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/lapack.h b/paddle/pten/backends/dynload/lapack.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5636850f8d67b4b2ea3328dc5c83446aea71cc3
--- /dev/null
+++ b/paddle/pten/backends/dynload/lapack.h
@@ -0,0 +1,340 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <complex>
+#include <mutex>
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+// Note(zhouwei): LAPACK does not provide an appropriate header file, so the
+// prototypes of the routines used here have to be declared by hand.
+
+// getrf_(For example)
+extern "C" void dgetrf_(
+    int *m, int *n, double *a, int *lda, int *ipiv, int *info);
+extern "C" void sgetrf_(
+    int *m, int *n, float *a, int *lda, int *ipiv, int *info);
+
+// evd
+extern "C" void zheevd_(char *jobz,
+                        char *uplo,
+                        int *n,
+                        std::complex<double> *a,
+                        int *lda,
+                        double *w,
+                        std::complex<double> *work,
+                        int *lwork,
+                        double *rwork,
+                        int *lrwork,
+                        int *iwork,
+                        int *liwork,
+                        int *info);
+extern "C" void cheevd_(char *jobz,
+                        char *uplo,
+                        int *n,
+                        std::complex<float> *a,
+                        int *lda,
+                        float *w,
+                        std::complex<float> *work,
+                        int *lwork,
+                        float *rwork,
+                        int *lrwork,
+                        int *iwork,
+                        int *liwork,
+                        int *info);
+extern "C" void dsyevd_(char *jobz,
+                        char *uplo,
+                        int *n,
+                        double *a,
+                        int *lda,
+                        double *w,
+                        double *work,
+                        int *lwork,
+                        int *iwork,
+                        int *liwork,
+                        int *info);
+extern "C" void ssyevd_(char *jobz,
+                        char *uplo,
+                        int *n,
+                        float *a,
+                        int *lda,
+                        float *w,
+                        float *work,
+                        int *lwork,
+                        int *iwork,
+                        int *liwork,
+                        int *info);
+
+// geev
+extern "C" void dgeev_(char *jobvl,
+                       char *jobvr,
+                       int *n,
+                       double *a,
+                       int *lda,
+                       double *wr,
+                       double *wi,
+                       double *vl,
+                       int *ldvl,
+                       double *vr,
+                       int *ldvr,
+                       double *work,
+                       int *lwork,
+                       int *info);
+extern "C" void sgeev_(char *jobvl,
+                       char *jobvr,
+                       int *n,
+                       float *a,
+                       int *lda,
+                       float *wr,
+                       float *wi,
+                       float *vl,
+                       int *ldvl,
+                       float *vr,
+                       int *ldvr,
+                       float *work,
+                       int *lwork,
+                       int *info);
+extern "C" void zgeev_(char *jobvl,
+                       char *jobvr,
+                       int *n,
+                       std::complex<double> *a,
+                       int *lda,
+                       std::complex<double> *w,
+                       std::complex<double> *vl,
+                       int *ldvl,
+                       std::complex<double> *vr,
+                       int *ldvr,
+                       std::complex<double> *work,
+                       int *lwork,
+                       double *rwork,
+                       int *info);
+extern "C" void cgeev_(char *jobvl,
+                       char *jobvr,
+                       int *n,
+                       std::complex<float> *a,
+                       int *lda,
+                       std::complex<float> *w,
+                       std::complex<float> *vl,
+                       int *ldvl,
+                       std::complex<float> *vr,
+                       int *ldvr,
+                       std::complex<float> *work,
+                       int *lwork,
+                       float *rwork,
+                       int *info);
+
+// gels
+extern "C" void dgels_(char *trans,
+                       int *m,
+                       int *n,
+                       int *nrhs,
+                       double *a,
+                       int *lda,
+                       double *b,
+                       int *ldb,
+                       double *work,
+                       int *lwork,
+                       int *info);
+extern "C" void sgels_(char *trans,
+                       int *m,
+                       int *n,
+                       int *nrhs,
+                       float *a,
+                       int *lda,
+                       float *b,
+                       int *ldb,
+                       float *work,
+                       int *lwork,
+                       int *info);
+
+// gelsd
+extern "C" void dgelsd_(int *m,
+                        int *n,
+                        int *nrhs,
+                        double *a,
+                        int *lda,
+                        double *b,
+                        int *ldb,
+                        double *s,
+                        double *rcond,
+                        int *rank,
+                        double *work,
+                        int *lwork,
+                        int *iwork,
+                        int *info);
+extern "C" void sgelsd_(int *m,
+                        int *n,
+                        int *nrhs,
+                        float *a,
+                        int *lda,
+                        float *b,
+                        int *ldb,
+                        float *s,
+                        float *rcond,
+                        int *rank,
+                        float *work,
+                        int *lwork,
+                        int *iwork,
+                        int *info);
+
+// gelsy
+extern "C" void dgelsy_(int *m,
+                        int *n,
+                        int *nrhs,
+                        double *a,
+                        int *lda,
+                        double *b,
+                        int *ldb,
+                        int *jpvt,
+                        double *rcond,
+                        int *rank,
+                        double *work,
+                        int *lwork,
+                        int *info);
+extern "C" void sgelsy_(int *m,
+                        int *n,
+                        int *nrhs,
+                        float *a,
+                        int *lda,
+                        float *b,
+                        int *ldb,
+                        int *jpvt,
+                        float *rcond,
+                        int *rank,
+                        float *work,
+                        int *lwork,
+                        int *info);
+
+// gelss
+extern "C" void dgelss_(int *m,
+                        int *n,
+                        int *nrhs,
+                        double *a,
+                        int *lda,
+                        double *b,
+                        int *ldb,
+                        double *s,
+                        double *rcond,
+                        int *rank,
+                        double *work,
+                        int *lwork,
+                        int *info);
+extern "C" void sgelss_(int *m,
+                        int *n,
+                        int *nrhs,
+                        float *a,
+                        int *lda,
+                        float *b,
+                        int *ldb,
+                        float *s,
+                        float *rcond,
+                        int *rank,
+                        float *work,
+                        int *lwork,
+                        int *info);
+
+extern "C" void zpotrs_(char *uplo,
+                        int *n,
+                        int *nrhs,
+                        std::complex<double> *a,
+                        int *lda,
+                        std::complex<double> *b,
+                        int *ldb,
+                        int *info);
+extern "C" void cpotrs_(char *uplo,
+                        int *n,
+                        int *nrhs,
+                        std::complex<float> *a,
+                        int *lda,
+                        std::complex<float> *b,
+                        int *ldb,
+                        int *info);
+extern "C" void dpotrs_(char *uplo,
+                        int *n,
+                        int *nrhs,
+                        double *a,
+                        int *lda,
+                        double *b,
+                        int *ldb,
+                        int *info);
+extern "C" void spotrs_(char *uplo,
+                        int *n,
+                        int *nrhs,
+                        float *a,
+                        int *lda,
+                        float *b,
+                        int *ldb,
+                        int *info);
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag lapack_dso_flag;
+extern void *lapack_dso_handle;
+
+/**
+ * Generates, for each LAPACK routine, a functor struct whose operator()
+ * loads the LAPACK DSO handle once (guarded by lapack_dso_flag), looks the
+ * symbol up with dlsym (cached in a local static) and forwards the call.
+ */
+#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                             \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using lapackFunc = decltype(&::__name);                        \
+      std::call_once(lapack_dso_flag, []() {                         \
+        lapack_dso_handle = pten::dynload::GetLAPACKDsoHandle();     \
+      });                                                            \
+      static void *p_##__name = dlsym(lapack_dso_handle, #__name);   \
+      return reinterpret_cast<lapackFunc>(p_##__name)(args...);      \
+    }                                                                \
+  };                                                                 \
+  extern DynLoad__##__name __name
+
+#define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \
+  DYNAMIC_LOAD_LAPACK_WRAP(__name)
+
+#define LAPACK_ROUTINE_EACH(__macro) \
+  __macro(dgetrf_);                  \
+  __macro(sgetrf_);                  \
+  __macro(zheevd_);                  \
+  __macro(cheevd_);                  \
+  __macro(dsyevd_);                  \
+  __macro(ssyevd_);                  \
+  __macro(dgeev_);                   \
+  __macro(sgeev_);                   \
+  __macro(zgeev_);                   \
+  __macro(cgeev_);                   \
+  __macro(dgels_);                   \
+  __macro(sgels_);                   \
+  __macro(dgelsd_);                  \
+  __macro(sgelsd_);                  \
+  __macro(dgelsy_);                  \
+  __macro(sgelsy_);                  \
+  __macro(dgelss_);                  \
+  __macro(sgelss_);                  \
+  __macro(zpotrs_);                  \
+  __macro(cpotrs_);                  \
+  __macro(dpotrs_);                  \
+  __macro(spotrs_);
+
+LAPACK_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_LAPACK_WRAP);
+
+#undef DYNAMIC_LOAD_LAPACK_WRAP
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/miopen.cc b/paddle/pten/backends/dynload/miopen.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eaa97f6d85041e5368b3fc45f4c0df45411067e1
--- /dev/null
+++ b/paddle/pten/backends/dynload/miopen.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/miopen.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+namespace dynload {
+std::once_flag miopen_dso_flag;
+void* miopen_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+MIOPEN_DNN_ROUTINE_EACH(DEFINE_WRAP);
+MIOPEN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
+
+#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R3
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
+#endif
+
+#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R4
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
+#endif
+
+#ifdef MIOPEN_DNN_ROUTINE_EACH_R5
+MIOPEN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
+#endif
+
+#ifdef MIOPEN_DNN_ROUTINE_EACH_R6
+MIOPEN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
+#endif
+
+#ifdef MIOPEN_DNN_ROUTINE_EACH_R7
+MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
+#endif
+
+#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R7
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
+#endif
+
+bool HasCUDNN() {  // True when the cached MIOpen DSO handle is non-null.
+  const auto load_handle = []() { miopen_dso_handle = GetCUDNNDsoHandle(); };
+  std::call_once(miopen_dso_flag, load_handle);
+  return nullptr != miopen_dso_handle;
+}
+
+void EnforceCUDNNLoaded(const char* fn_name) {  // Enforce MIOpen DSO is loaded.
+  PADDLE_ENFORCE_NOT_NULL(
+      miopen_dso_handle,  // Stays nullptr until loaded, e.g. by HasCUDNN().
+      paddle::platform::errors::PreconditionNotMet(
+          "Cannot load miopen shared library. Cannot invoke method %s.",
+          fn_name));  // fn_name is only used to fill in the error message.
+}
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/miopen.h b/paddle/pten/backends/dynload/miopen.h
new file mode 100644
index 0000000000000000000000000000000000000000..9868953fc2f664f67ca276273d952d1f0c741c39
--- /dev/null
+++ b/paddle/pten/backends/dynload/miopen.h
@@ -0,0 +1,196 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <glog/logging.h>
+
+#include <miopen/miopen.h>
+#include <miopen/version.h>
+#include <mutex>  // NOLINT
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+#define MIOPEN_VERSION                                       \
+  (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \
+   MIOPEN_VERSION_PATCH)  // NOLINT
+
+// MIOpen only supports NCHW; this enum exists for cuDNN API compatibility
+typedef enum {
+  MIOPEN_TENSOR_NCHW = 0,
+  MIOPEN_TENSOR_NHWC = 1,
+} miopenTensorFormat_t;
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag miopen_dso_flag;
+extern void* miopen_dso_handle;
+extern bool HasCUDNN();
+
+inline const char* miopenGetErrorString(miopenStatus_t status) {
+  switch (status) {  // translate a MIOpen status code into a fixed literal
+    case miopenStatusSuccess:
+      return "MIOPEN_STATUS_SUCCESS";
+    case miopenStatusNotInitialized:
+      return "MIOPEN_STATUS_NOT_INITIALIZED";
+    case miopenStatusInvalidValue:
+      return "MIOPEN_STATUS_INVALID_VALUE";
+    case miopenStatusBadParm:  // enumerator is spelled "Parm", text "PARAM"
+      return "MIOPEN_STATUS_BAD_PARAM";
+    case miopenStatusAllocFailed:
+      return "MIOPEN_STATUS_ALLOC_FAILED";
+    case miopenStatusInternalError:
+      return "MIOPEN_STATUS_INTERNAL_ERROR";
+    case miopenStatusNotImplemented:
+      return "MIOPEN_STATUS_NOT_IMPLEMENTED";
+    case miopenStatusUnsupportedOp:
+      return "MIOPEN_STATUS_UNSUPPORTED_OP";
+    case miopenStatusUnknownError:  // shares the result of the default label
+    default:
+      return "MIOPEN_STATUS_UNKNOWN_ERROR";
+  }
+}
+
+extern void EnforceCUDNNLoaded(const char* fn_name);
+#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name)                     \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using miopen_func = decltype(&::__name);                       \
+      std::call_once(miopen_dso_flag, []() {                         \
+        miopen_dso_handle = pten::dynload::GetCUDNNDsoHandle();      \
+      });                                                            \
+      EnforceCUDNNLoaded(#__name);                                   \
+      static void* p_##__name = dlsym(miopen_dso_handle, #__name);   \
+      return reinterpret_cast<miopen_func>(p_##__name)(args...);     \
+    }                                                                \
+  };                                                                 \
+  extern struct DynLoad__##__name __name
+
+/**
+ * include all needed miopen functions in HPPL
+ **/
+#define MIOPEN_DNN_ROUTINE_EACH(__macro)                  \
+  __macro(miopenGetVersion);                              \
+  __macro(miopenOpTensor);                                \
+  __macro(miopenSet4dTensorDescriptor);                   \
+  __macro(miopenSetTensorDescriptor);                     \
+  __macro(miopenInitConvolutionNdDescriptor);             \
+  __macro(miopenFindConvolutionForwardAlgorithm);         \
+  __macro(miopenGetConvolutionNdForwardOutputDim);        \
+  __macro(miopenFindConvolutionBackwardDataAlgorithm);    \
+  __macro(miopenFindConvolutionBackwardWeightsAlgorithm); \
+  __macro(miopenGetTensorDescriptor);                     \
+  __macro(miopenCreateTensorDescriptor);                  \
+  __macro(miopenDestroyTensorDescriptor);                 \
+  __macro(miopenGetTensorDescriptorSize);                 \
+  __macro(miopenSet2dPoolingDescriptor);                  \
+  __macro(miopenGet2dPoolingDescriptor);                  \
+  __macro(miopenGetPoolingNdForwardOutputDim);            \
+  __macro(miopenCreateConvolutionDescriptor);             \
+  __macro(miopenCreatePoolingDescriptor);                 \
+  __macro(miopenDestroyPoolingDescriptor);                \
+  __macro(miopenPoolingGetWorkSpaceSize);                 \
+  __macro(miopenPoolingGetWorkSpaceSizeV2);               \
+  __macro(miopenSetNdPoolingDescriptor);                  \
+  __macro(miopenInitConvolutionDescriptor);               \
+  __macro(miopenDestroyConvolutionDescriptor);            \
+  __macro(miopenGetConvolutionNdDescriptor);              \
+  __macro(miopenDeriveBNTensorDescriptor);                \
+  __macro(miopenCreate);                                  \
+  __macro(miopenDestroy);                                 \
+  __macro(miopenSetStream);                               \
+  __macro(miopenActivationForward);                       \
+  __macro(miopenActivationBackward);                      \
+  __macro(miopenConvolutionBackwardWeights);              \
+  __macro(miopenConvolutionForward);                      \
+  __macro(miopenConvolutionForwardBias);                  \
+  __macro(miopenConvolutionBackwardBias);                 \
+  __macro(miopenConvolutionForwardGetWorkSpaceSize);      \
+  __macro(miopenConvolutionBackwardDataGetWorkSpaceSize); \
+  __macro(miopenTransformTensor);                         \
+  __macro(miopenPoolingForward);                          \
+  __macro(miopenPoolingBackward);                         \
+  __macro(miopenSoftmaxBackward);                         \
+  __macro(miopenSoftmaxBackward_V2);                      \
+  __macro(miopenSoftmaxForward);                          \
+  __macro(miopenSoftmaxForward_V2);                       \
+  __macro(miopenCreateDropoutDescriptor);                 \
+  __macro(miopenDestroyDropoutDescriptor);                \
+  __macro(miopenRestoreDropoutDescriptor);                \
+  __macro(miopenDropoutGetStatesSize);                    \
+  __macro(miopenSetDropoutDescriptor);                    \
+  __macro(miopenCreateRNNDescriptor);                     \
+  __macro(miopenDestroyRNNDescriptor);                    \
+  __macro(miopenSetRNNDescriptor);                        \
+  __macro(miopenSetRNNDescriptor_V2);                     \
+  __macro(miopenGetRNNParamsSize);                        \
+  __macro(miopenGetRNNWorkspaceSize);                     \
+  __macro(miopenGetRNNTrainingReserveSize);               \
+  __macro(miopenRNNForwardTraining);                      \
+  __macro(miopenRNNBackwardData);                         \
+  __macro(miopenRNNBackwardWeights);                      \
+  __macro(miopenRNNForwardInference);                     \
+  __macro(miopenGetTensorNumBytes);
+
+MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+
+#define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \
+  __macro(miopenConvolutionBackwardData);
+MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+
+// APIs available after R3:
+#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
+  __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize);
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+
+// APIs available after R4:
+#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro)    \
+  __macro(miopenBatchNormalizationForwardTraining);  \
+  __macro(miopenBatchNormalizationForwardInference); \
+  __macro(miopenBatchNormalizationBackward);
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+
+// APIs in R5
+#define MIOPEN_DNN_ROUTINE_EACH_R5(__macro)  \
+  __macro(miopenCreateActivationDescriptor); \
+  __macro(miopenSetActivationDescriptor);    \
+  __macro(miopenGetActivationDescriptor);    \
+  __macro(miopenDestroyActivationDescriptor);
+MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+
+// APIs in R6
+#define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \
+/*__macro(miopenSetRNNDescriptor_v6);*/
+MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+
+#define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \
+  __macro(miopenSetConvolutionGroupCount);  \
+  __macro(miopenCreateCTCLossDescriptor);   \
+  __macro(miopenDestroyCTCLossDescriptor);  \
+  __macro(miopenGetCTCLossDescriptor);      \
+  __macro(miopenSetCTCLossDescriptor);      \
+  __macro(miopenGetCTCLossWorkspaceSize);   \
+  __macro(miopenCTCLoss);
+MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+
+#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro)                    \
+/*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
+__macro(cudnnBatchNormalizationForwardTrainingEx);                   \
+__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize);          \
+__macro(cudnnBatchNormalizationBackwardEx);                          \
+__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/
+MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/mklml.cc b/paddle/pten/backends/dynload/mklml.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dfa1491f02709aea04f73ed1035bb7039d972851
--- /dev/null
+++ b/paddle/pten/backends/dynload/mklml.cc
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/mklml.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag mklml_dso_flag;
+void* mklml_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+MKLML_ROUTINE_EACH(DEFINE_WRAP);
+
+#if !defined(_WIN32)
+DEFINE_WRAP(mkl_scsrmm);
+DEFINE_WRAP(mkl_dcsrmm);
+#endif
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/mklml.h b/paddle/pten/backends/dynload/mklml.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8baca6aecdd1c79aaa2c6cfa8b6997b96dbf210
--- /dev/null
+++ b/paddle/pten/backends/dynload/mklml.h
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <mkl.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag mklml_dso_flag;
+extern void *mklml_dso_handle;
+
+/**
+ * Generates, per MKLML routine, a functor struct whose operator() loads the
+ * library once (mklml_dso_flag), looks the symbol up with dlsym into a
+ * function-local static, and forwards the call arguments to it.
+ */
+#define DYNAMIC_LOAD_MKLML_WRAP(__name)                              \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using mklmlFunc = decltype(&::__name);                         \
+      std::call_once(mklml_dso_flag, []() {                          \
+        mklml_dso_handle = pten::dynload::GetMKLMLDsoHandle();       \
+      });                                                            \
+      static void *p_##__name = dlsym(mklml_dso_handle, #__name);    \
+      return reinterpret_cast<mklmlFunc>(p_##__name)(args...);       \
+    }                                                                \
+  };                                                                 \
+  extern DynLoad__##__name __name
+
+#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
+
+#define MKLML_ROUTINE_EACH(__macro) \
+  __macro(cblas_sgemm);             \
+  __macro(cblas_dgemm);             \
+  __macro(cblas_cgemm);             \
+  __macro(cblas_zgemm);             \
+  __macro(cblas_saxpy);             \
+  __macro(cblas_daxpy);             \
+  __macro(cblas_caxpy);             \
+  __macro(cblas_zaxpy);             \
+  __macro(cblas_scopy);             \
+  __macro(cblas_dcopy);             \
+  __macro(cblas_ccopy);             \
+  __macro(cblas_zcopy);             \
+  __macro(cblas_sgemv);             \
+  __macro(cblas_dgemv);             \
+  __macro(cblas_cgemv);             \
+  __macro(cblas_zgemv);             \
+  __macro(cblas_strsm);             \
+  __macro(cblas_dtrsm);             \
+  __macro(cblas_ctrsm);             \
+  __macro(cblas_ztrsm);             \
+  __macro(cblas_sgemm_alloc);       \
+  __macro(cblas_dgemm_alloc);       \
+  __macro(cblas_sgemm_pack);        \
+  __macro(cblas_dgemm_pack);        \
+  __macro(cblas_sgemm_compute);     \
+  __macro(cblas_dgemm_compute);     \
+  __macro(cblas_sgemm_free);        \
+  __macro(cblas_dgemm_free);        \
+  __macro(cblas_sgemm_batch);       \
+  __macro(cblas_dgemm_batch);       \
+  __macro(cblas_cgemm_batch);       \
+  __macro(cblas_zgemm_batch);       \
+  __macro(cblas_sdot);              \
+  __macro(cblas_ddot);              \
+  __macro(cblas_sasum);             \
+  __macro(cblas_dasum);             \
+  __macro(cblas_isamax);            \
+  __macro(cblas_idamax);            \
+  __macro(cblas_sscal);             \
+  __macro(cblas_dscal);             \
+  __macro(vsAdd);                   \
+  __macro(vdAdd);                   \
+  __macro(vsSub);                   \
+  __macro(vdSub);                   \
+  __macro(vsMul);                   \
+  __macro(vdMul);                   \
+  __macro(vsDiv);                   \
+  __macro(vdDiv);                   \
+  __macro(vsExp);                   \
+  __macro(vdExp);                   \
+  __macro(vsSqr);                   \
+  __macro(vdSqr);                   \
+  __macro(vsPowx);                  \
+  __macro(vdPowx);                  \
+  __macro(vsInv);                   \
+  __macro(vdInv);                   \
+  __macro(vmsErf);                  \
+  __macro(vmdErf);                  \
+  __macro(MKL_Free_Buffers);        \
+  __macro(MKL_Set_Num_Threads);     \
+  __macro(MKL_Get_Max_Threads);
+
+MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
+
+#if !defined(_WIN32)
+DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
+DYNAMIC_LOAD_MKLML_WRAP(mkl_dcsrmm);
+#endif
+
+#undef DYNAMIC_LOAD_MKLML_WRAP
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/mklrt.cc b/paddle/pten/backends/dynload/mklrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..27c544ff25ff74fad6d567d42feabe36609b1b3c
--- /dev/null
+++ b/paddle/pten/backends/dynload/mklrt.cc
@@ -0,0 +1,50 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/mklrt.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag mklrt_dso_flag;
+void* mklrt_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+MKLDFTI_ROUTINE_EACH(DEFINE_WRAP);
+
+DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc,
+                                           enum DFTI_CONFIG_VALUE prec,
+                                           enum DFTI_CONFIG_VALUE domain,
+                                           MKL_LONG dim,
+                                           MKL_LONG* sizes) {
+  // Pick the precision- and rank-specific creator when one is wrapped;
+  // every other precision is handed to the generic DftiCreateDescriptor.
+  const bool one_dim = (dim == 1);
+
+  switch (prec) {
+    case DFTI_SINGLE:
+      // The 1-d creators receive sizes[0] by value instead of the array.
+      return one_dim ? DftiCreateDescriptor_s_1d(desc, domain, sizes[0])
+                     : DftiCreateDescriptor_s_md(desc, domain, dim, sizes);
+    case DFTI_DOUBLE:
+      return one_dim ? DftiCreateDescriptor_d_1d(desc, domain, sizes[0])
+                     : DftiCreateDescriptor_d_md(desc, domain, dim, sizes);
+    default:
+      return DftiCreateDescriptor(desc, prec, domain, dim, sizes);
+  }
+}
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/mklrt.h b/paddle/pten/backends/dynload/mklrt.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe87b170a1c96c5c1f18e58b086e427ecb1422c2
--- /dev/null
+++ b/paddle/pten/backends/dynload/mklrt.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <mkl_dfti.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag mklrt_dso_flag;
+extern void* mklrt_dso_handle;
+
+/**
+ * Generates, for each MKL DFTI routine, a functor struct whose operator()
+ * loads the library once (mklrt_dso_flag), looks the symbol up with dlsym
+ * into a function-local static, and forwards the call arguments to it.
+ */
+#define DYNAMIC_LOAD_MKLRT_WRAP(__name)                              \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using mklrtFunc = decltype(&::__name);                         \
+      std::call_once(mklrt_dso_flag, []() {                          \
+        mklrt_dso_handle = pten::dynload::GetMKLRTDsoHandle();       \
+      });                                                            \
+      static void* p_##__name = dlsym(mklrt_dso_handle, #__name);    \
+      return reinterpret_cast<mklrtFunc>(p_##__name)(args...);       \
+    }                                                                \
+  };                                                                 \
+  extern DynLoad__##__name __name
+
+// mkl_dfti.h has a macro that shadows the function with the same name;
+// undefine this macro so as to export that function
+#undef DftiCreateDescriptor
+
+#define MKLDFTI_ROUTINE_EACH(__macro) \
+  __macro(DftiCreateDescriptor);      \
+  __macro(DftiCreateDescriptor_s_1d); \
+  __macro(DftiCreateDescriptor_d_1d); \
+  __macro(DftiCreateDescriptor_s_md); \
+  __macro(DftiCreateDescriptor_d_md); \
+  __macro(DftiSetValue);              \
+  __macro(DftiGetValue);              \
+  __macro(DftiCommitDescriptor);      \
+  __macro(DftiComputeForward);        \
+  __macro(DftiComputeBackward);       \
+  __macro(DftiFreeDescriptor);        \
+  __macro(DftiErrorClass);            \
+  __macro(DftiErrorMessage);
+
+MKLDFTI_ROUTINE_EACH(DYNAMIC_LOAD_MKLRT_WRAP)
+
+#undef DYNAMIC_LOAD_MKLRT_WRAP
+
+// declare a separately named function to avoid a naming conflict
+DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc,
+                                           enum DFTI_CONFIG_VALUE prec,
+                                           enum DFTI_CONFIG_VALUE domain,
+                                           MKL_LONG dim,
+                                           MKL_LONG* sizes);
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/nccl.cc b/paddle/pten/backends/dynload/nccl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d393213a0e1a861d3e7ddb39f8c3e3444ec91d9
--- /dev/null
+++ b/paddle/pten/backends/dynload/nccl.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/nccl.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag nccl_dso_flag;  // guards the one-time library load
+void *nccl_dso_handle = nullptr;  // set under nccl_dso_flag by the wrappers
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+#if NCCL_VERSION_CODE >= 2212
+NCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
+#endif
+
+#if NCCL_VERSION_CODE >= 2304
+NCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP)
+#endif
+
+#if NCCL_VERSION_CODE >= 2703
+NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
+#endif
+
+#if NCCL_VERSION_CODE >= 21100
+NCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP)
+#endif
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/nccl.h b/paddle/pten/backends/dynload/nccl.h
new file mode 100644
index 0000000000000000000000000000000000000000..85a062fbf08ab5b6b005e61ad8f26cbfe56dd8f3
--- /dev/null
+++ b/paddle/pten/backends/dynload/nccl.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <nccl.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag nccl_dso_flag;
+extern void* nccl_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                   \
+  struct DynLoad__##__name {                                     \
+    template <typename... Args>                                  \
+    auto operator()(Args... args) -> decltype(__name(args...)) { \
+      using nccl_func = decltype(&::__name);                     \
+      std::call_once(nccl_dso_flag, []() {                       \
+        nccl_dso_handle = pten::dynload::GetNCCLDsoHandle();     \
+      });                                                        \
+      static void* p_##__name = dlsym(nccl_dso_handle, #__name); \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);   \
+    }                                                            \
+  };                                                             \
+  extern DynLoad__##__name __name
+
+#define NCCL_RAND_ROUTINE_EACH(__macro) \
+  __macro(ncclCommInitAll);             \
+  __macro(ncclGetUniqueId);             \
+  __macro(ncclCommInitRank);            \
+  __macro(ncclCommDestroy);             \
+  __macro(ncclCommCount);               \
+  __macro(ncclCommCuDevice);            \
+  __macro(ncclCommUserRank);            \
+  __macro(ncclAllReduce);               \
+  __macro(ncclBcast);                   \
+  __macro(ncclAllGather);               \
+  __macro(ncclGroupStart);              \
+  __macro(ncclGroupEnd);                \
+  __macro(ncclReduce);                  \
+  __macro(ncclReduceScatter);           \
+  __macro(ncclGetErrorString);
+
+NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+
+#if NCCL_VERSION_CODE >= 2212
+#define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
+NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+#endif
+
+#if NCCL_VERSION_CODE >= 2304
+#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion);
+NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+#endif
+
+#if NCCL_VERSION_CODE >= 2703
+#define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
+  __macro(ncclSend);                               \
+  __macro(ncclRecv);
+NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+#endif
+
+#if NCCL_VERSION_CODE >= 21100
+#define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \
+  __macro(ncclRedOpCreatePreMulSum);                \
+  __macro(ncclRedOpDestroy);
+NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+#endif
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/nvjpeg.cc b/paddle/pten/backends/dynload/nvjpeg.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea385e282fc4bb16d94085ef12223d328ad34499
--- /dev/null
+++ b/paddle/pten/backends/dynload/nvjpeg.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/nvjpeg.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag nvjpeg_dso_flag;  // guards the one-time library load
+void *nvjpeg_dso_handle = nullptr;  // set under nvjpeg_dso_flag by wrappers
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/nvjpeg.h b/paddle/pten/backends/dynload/nvjpeg.h
new file mode 100644
index 0000000000000000000000000000000000000000..9abcfaee6ed3d774b0982d8591ad89878d4ff3bd
--- /dev/null
+++ b/paddle/pten/backends/dynload/nvjpeg.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_CUDA
+#include <nvjpeg.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+extern std::once_flag nvjpeg_dso_flag;
+extern void *nvjpeg_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name)                   \
+  struct DynLoad__##__name {                                       \
+    template <typename... Args>                                    \
+    nvjpegStatus_t operator()(Args... args) {                      \
+      using nvjpegFunc = decltype(&::__name);                      \
+      std::call_once(nvjpeg_dso_flag, []() {                       \
+        nvjpeg_dso_handle = pten::dynload::GetNvjpegDsoHandle();   \
+      });                                                          \
+      static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \
+      return reinterpret_cast<nvjpegFunc>(p_##__name)(args...);    \
+    }                                                              \
+  };                                                               \
+  extern DynLoad__##__name __name
+
+#define NVJPEG_RAND_ROUTINE_EACH(__macro) \
+  __macro(nvjpegCreateSimple);            \
+  __macro(nvjpegJpegStateCreate);         \
+  __macro(nvjpegGetImageInfo);            \
+  __macro(nvjpegJpegStateDestroy);        \
+  __macro(nvjpegDecode);
+
+NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
+
+}  // namespace dynload
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/backends/dynload/nvrtc.cc b/paddle/pten/backends/dynload/nvrtc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e86d943a249cb79c3dc85a570f90e2c8ed69abf7
--- /dev/null
+++ b/paddle/pten/backends/dynload/nvrtc.cc
@@ -0,0 +1,34 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/nvrtc.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag nvrtc_dso_flag;
+void* nvrtc_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+NVRTC_ROUTINE_EACH(DEFINE_WRAP);
+
+bool HasNVRTC() {  // true iff the NVRTC handle is non-null after loading
+  const auto load_handle = []() { nvrtc_dso_handle = GetNVRTCDsoHandle(); };
+  std::call_once(nvrtc_dso_flag, load_handle);
+  return nullptr != nvrtc_dso_handle;
+}
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/nvrtc.h b/paddle/pten/backends/dynload/nvrtc.h
new file mode 100644
index 0000000000000000000000000000000000000000..5dcb1142d58fe73b534432202ef3d44467113b3f
--- /dev/null
+++ b/paddle/pten/backends/dynload/nvrtc.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <nvrtc.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag nvrtc_dso_flag;
+extern void* nvrtc_dso_handle;
+extern bool HasNVRTC();
+
+#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name)                      \
+  struct DynLoad__##__name { /* lazy proxy for ::__name */           \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using nvrtc_func = decltype(&::__name);                        \
+      std::call_once(nvrtc_dso_flag, []() {                          \
+        nvrtc_dso_handle = pten::dynload::GetNVRTCDsoHandle();       \
+      }); /* library handle fetched only once */                     \
+      static void* p_##__name = dlsym(nvrtc_dso_handle, #__name);    \
+      return reinterpret_cast<nvrtc_func>(p_##__name)(args...);      \
+    } /* NOTE: dlsym() result is not checked for null */             \
+  };                                                                 \
+  extern struct DynLoad__##__name __name
+
+/**
+ * Lists every NVRTC function wrapped here; __macro is applied to each one.
+ **/
+#define NVRTC_ROUTINE_EACH(__macro) \
+  __macro(nvrtcVersion);            \
+  __macro(nvrtcGetErrorString);     \
+  __macro(nvrtcCompileProgram);     \
+  __macro(nvrtcCreateProgram);      \
+  __macro(nvrtcDestroyProgram);     \
+  __macro(nvrtcGetPTX);             \
+  __macro(nvrtcGetPTXSize);         \
+  __macro(nvrtcGetProgramLog);      \
+  __macro(nvrtcGetProgramLogSize)
+
+NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/nvtx.cc b/paddle/pten/backends/dynload/nvtx.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d248ff2de09fb55ba7930292c4cfa3100062420
--- /dev/null
+++ b/paddle/pten/backends/dynload/nvtx.cc
@@ -0,0 +1,29 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef _WIN32
+#include "paddle/pten/backends/dynload/nvtx.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag nvtx_dso_flag;  // guards the one-time handle lookup
+void *nvtx_dso_handle;         // no initializer: zero-initialized global
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name  // one wrapper object
+
+NVTX_ROUTINE_EACH(DEFINE_WRAP);  // define a wrapper for every listed symbol
+
+}  // namespace dynload
+}  // namespace pten
+#endif
diff --git a/paddle/pten/backends/dynload/nvtx.h b/paddle/pten/backends/dynload/nvtx.h
new file mode 100644
index 0000000000000000000000000000000000000000..98c2d539339c8787072324bc3b79dce8aa3e9511
--- /dev/null
+++ b/paddle/pten/backends/dynload/nvtx.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#ifndef _WIN32
+#include <cuda.h>
+#include <nvToolsExt.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+extern std::once_flag nvtx_dso_flag;
+extern void *nvtx_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name)                   \
+  struct DynLoad__##__name { /* lazy proxy for ::__name */       \
+    template <typename... Args>                                  \
+    int operator()(Args... args) {                               \
+      using nvtxFunc = decltype(&::__name);                      \
+      std::call_once(nvtx_dso_flag, []() {                       \
+        nvtx_dso_handle = pten::dynload::GetNvtxDsoHandle();     \
+      }); /* library handle fetched only once */                 \
+      static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \
+      return reinterpret_cast<nvtxFunc>(p_##__name)(args...);    \
+    } /* NOTE: dlsym() result is not checked for null */         \
+  };                                                             \
+  extern DynLoad__##__name __name
+
+#define NVTX_ROUTINE_EACH(__macro) \
+  __macro(nvtxRangePushA);         \
+  __macro(nvtxRangePop);
+
+NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
+}  // namespace dynload
+}  // namespace pten
+#endif
diff --git a/paddle/fluid/platform/port.h b/paddle/pten/backends/dynload/port.h
similarity index 100%
rename from paddle/fluid/platform/port.h
rename to paddle/pten/backends/dynload/port.h
diff --git a/paddle/pten/backends/dynload/rccl.cc b/paddle/pten/backends/dynload/rccl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..46bbaea13625352727878932acf40d5caab8562b
--- /dev/null
+++ b/paddle/pten/backends/dynload/rccl.cc
@@ -0,0 +1,36 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/rccl.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag rccl_dso_flag;  // guards the one-time handle lookup
+void *rccl_dso_handle;         // no initializer: zero-initialized global
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name  // one wrapper object
+
+RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);  // wrappers for the base symbol list
+
+#if NCCL_VERSION_CODE >= 2212  // same version gate as the list in rccl.h
+RCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
+#endif
+
+#if NCCL_VERSION_CODE >= 2703  // same version gate as the list in rccl.h
+RCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
+#endif
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/rccl.h b/paddle/pten/backends/dynload/rccl.h
new file mode 100644
index 0000000000000000000000000000000000000000..370bab6658f2a56af8667c58289a0ee17dca5893
--- /dev/null
+++ b/paddle/pten/backends/dynload/rccl.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <rccl.h>
+
+#include <mutex>  // NOLINT
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag rccl_dso_flag;
+extern void* rccl_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name)                   \
+  struct DynLoad__##__name { /* lazy proxy for ::__name */       \
+    template <typename... Args>                                  \
+    auto operator()(Args... args) -> decltype(__name(args...)) { \
+      using nccl_func = decltype(&::__name);                     \
+      std::call_once(rccl_dso_flag, []() {                       \
+        rccl_dso_handle = pten::dynload::GetNCCLDsoHandle();     \
+      }); /* library handle fetched only once */                 \
+      static void* p_##__name = dlsym(rccl_dso_handle, #__name); \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);   \
+    } /* NOTE: dlsym() result is not checked for null */         \
+  };                                                             \
+  extern DynLoad__##__name __name
+
+#define RCCL_RAND_ROUTINE_EACH(__macro) \
+  __macro(ncclCommInitAll);             \
+  __macro(ncclGetUniqueId);             \
+  __macro(ncclCommInitRank);            \
+  __macro(ncclCommDestroy);             \
+  __macro(ncclCommCount);               \
+  __macro(ncclCommCuDevice);            \
+  __macro(ncclCommUserRank);            \
+  __macro(ncclAllReduce);               \
+  __macro(ncclBcast);                   \
+  __macro(ncclAllGather);               \
+  __macro(ncclGroupStart);              \
+  __macro(ncclGroupEnd);                \
+  __macro(ncclReduce);                  \
+  __macro(ncclReduceScatter);           \
+  __macro(ncclGetErrorString);
+
+RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+
+#if NCCL_VERSION_CODE >= 2212
+#define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
+RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+#endif
+
+#if NCCL_VERSION_CODE >= 2703
+#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
+  __macro(ncclSend);                               \
+  __macro(ncclRecv);
+RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+#endif
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/rocblas.cc b/paddle/pten/backends/dynload/rocblas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b7d4469f38979f874f007ab5823f324dede1c65
--- /dev/null
+++ b/paddle/pten/backends/dynload/rocblas.cc
@@ -0,0 +1,38 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/rocblas.h"
+
+namespace pten {
+namespace dynload {
+std::once_flag rocblas_dso_flag;     // guards the one-time handle lookup
+void *rocblas_dso_handle = nullptr;  // assigned inside std::call_once
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name  // one wrapper object
+
+ROCBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);  // wrappers for the base symbol list
+
+#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R2  // only if rocblas.h defines this list
+ROCBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
+#endif
+
+#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R3  // commented out in rocblas.h
+ROCBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
+#endif
+
+#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R4  // only if rocblas.h defines this list
+ROCBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
+#endif
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/rocblas.h b/paddle/pten/backends/dynload/rocblas.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb7e6d58e1b3e2e0ac31a8aed6abc3f9334f0ce4
--- /dev/null
+++ b/paddle/pten/backends/dynload/rocblas.h
@@ -0,0 +1,112 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <rocblas.h>
+#include <mutex>  // NOLINT
+#include <type_traits>
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag rocblas_dso_flag;
+extern void *rocblas_dso_handle;
+
+/**
+ * The following macro definition generates a struct
+ * (for each function) to dynamically load a rocblas
+ * routine via operator overloading.
+ *
+ * note: default dynamic linked libs
+ */
+#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name)                   \
+  struct DynLoad__##__name { /* lazy proxy for ::__name */          \
+    template <typename... Args>                                     \
+    rocblas_status operator()(Args... args) {                       \
+      using rocblas_func = decltype(&::__name);                     \
+      std::call_once(rocblas_dso_flag, []() {                       \
+        rocblas_dso_handle = pten::dynload::GetCublasDsoHandle();   \
+      }); /* library handle fetched only once */                    \
+      static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \
+      return reinterpret_cast<rocblas_func>(p_##__name)(args...);   \
+    } /* NOTE: dlsym() result is not checked for null */            \
+  };                                                                \
+  extern DynLoad__##__name __name
+
+#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
+  __macro(rocblas_caxpy);                  \
+  __macro(rocblas_saxpy);                  \
+  __macro(rocblas_daxpy);                  \
+  __macro(rocblas_zaxpy);                  \
+  __macro(rocblas_sscal);                  \
+  __macro(rocblas_dscal);                  \
+  __macro(rocblas_scopy);                  \
+  __macro(rocblas_dcopy);                  \
+  __macro(rocblas_cgemv);                  \
+  __macro(rocblas_sgemv);                  \
+  __macro(rocblas_zgemv);                  \
+  __macro(rocblas_dgemv);                  \
+  __macro(rocblas_cgemm);                  \
+  __macro(rocblas_sgemm);                  \
+  __macro(rocblas_dgemm);                  \
+  __macro(rocblas_hgemm);                  \
+  __macro(rocblas_zgemm);                  \
+  __macro(rocblas_sgeam);                  \
+  __macro(rocblas_strsm);                  \
+  __macro(rocblas_dtrsm);                  \
+  __macro(rocblas_dgeam);                  \
+  __macro(rocblas_sgemm_batched);          \
+  __macro(rocblas_dgemm_batched);          \
+  __macro(rocblas_cgemm_batched);          \
+  __macro(rocblas_zgemm_batched);          \
+  __macro(rocblas_create_handle);          \
+  __macro(rocblas_destroy_handle);         \
+  __macro(rocblas_set_stream);             \
+  __macro(rocblas_get_stream);             \
+  __macro(rocblas_set_pointer_mode);       \
+  __macro(rocblas_get_pointer_mode);
+
+ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+
+// APIs available after CUDA 8.0
+#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
+  __macro(rocblas_gemm_ex);                   \
+  __macro(rocblas_sgemm_strided_batched);     \
+  __macro(rocblas_dgemm_strided_batched);     \
+  __macro(rocblas_cgemm_strided_batched);     \
+  __macro(rocblas_zgemm_strided_batched);     \
+  __macro(rocblas_hgemm_strided_batched);
+
+ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+
+// HIP not supported in ROCM3.5
+// #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
+//   __macro(cublasSetMathMode);
+//   __macro(cublasGetMathMode);
+// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+
+#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
+  __macro(rocblas_gemm_batched_ex);           \
+  __macro(rocblas_gemm_strided_batched_ex);
+
+ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
+
+#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/rocm_driver.cc b/paddle/pten/backends/dynload/rocm_driver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51d55e80a725ece8d662a55eb427be3734667711
--- /dev/null
+++ b/paddle/pten/backends/dynload/rocm_driver.cc
@@ -0,0 +1,33 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/rocm_driver.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag rocm_dso_flag;     // guards the one-time handle lookup
+void* rocm_dso_handle = nullptr;  // assigned inside std::call_once
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name  // one wrapper object
+
+ROCM_ROUTINE_EACH(DEFINE_WRAP);  // define a wrapper for every listed symbol
+
+bool HasCUDADriver() {
+  std::call_once(rocm_dso_flag, [] { rocm_dso_handle = GetCUDADsoHandle(); });
+  return nullptr != rocm_dso_handle;  // true iff the driver handle was obtained
+}
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/rocm_driver.h b/paddle/pten/backends/dynload/rocm_driver.h
new file mode 100644
index 0000000000000000000000000000000000000000..fcc6b7f037cc9d0999c22d33377a6c8de61b5957
--- /dev/null
+++ b/paddle/pten/backends/dynload/rocm_driver.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag rocm_dso_flag;
+extern void* rocm_dso_handle;
+extern bool HasCUDADriver();
+
+#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name)                       \
+  struct DynLoad__##__name { /* lazy proxy for ::__name */           \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using rocm_func = decltype(&::__name);                         \
+      std::call_once(rocm_dso_flag, []() {                           \
+        rocm_dso_handle = pten::dynload::GetCUDADsoHandle();         \
+      }); /* library handle fetched only once */                     \
+      static void* p_##__name = dlsym(rocm_dso_handle, #__name);     \
+      return reinterpret_cast<rocm_func>(p_##__name)(args...);       \
+    } /* NOTE: dlsym() result is not checked for null */             \
+  };                                                                 \
+  extern struct DynLoad__##__name __name
+
+/**
+ * Lists every HIP function wrapped here; __macro is applied to each one.
+ **/
+#define ROCM_ROUTINE_EACH(__macro)                            \
+  __macro(hipDriverGetVersion);                               \
+  __macro(hipGetErrorString);                                 \
+  __macro(hipModuleLoadData);                                 \
+  __macro(hipModuleGetFunction);                              \
+  __macro(hipModuleUnload);                                   \
+  /*rocm3.5 not support the function*/                        \
+  /* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \
+  __macro(hipModuleLaunchKernel);                             \
+  __macro(hipLaunchKernel);                                   \
+  __macro(hipGetDevice);                                      \
+  __macro(hipGetDeviceCount);                                 \
+  __macro(hipDevicePrimaryCtxGetState)
+
+ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/tensorrt.cc b/paddle/pten/backends/dynload/tensorrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..680dad5289018ae8374a377afebf39b91ca8cc41
--- /dev/null
+++ b/paddle/pten/backends/dynload/tensorrt.cc
@@ -0,0 +1,83 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/pten/backends/dynload/tensorrt.h"
+#include <string>
+
+namespace pten {
+namespace dynload {
+
+std::once_flag tensorrt_dso_flag;  // guards the nvinfer handle lookup
+void* tensorrt_dso_handle;         // no initializer: zero-initialized global
+
+std::once_flag tensorrt_plugin_dso_flag;  // guards the plugin handle lookup
+void* tensorrt_plugin_dso_handle;         // zero-initialized global as well
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name  // one wrapper object
+
+TENSORRT_RAND_ROUTINE_EACH_POINTER(DEFINE_WRAP);      // pointer-returning APIs
+TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(DEFINE_WRAP);  // remaining core APIs
+TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DEFINE_WRAP);       // plugin-library APIs
+
+void* GetDsoHandle(const std::string& dso_name) {
+#if defined(_WIN32)
+  const int open_mode = 0;  // no RTLD_* flags are passed on Windows
+#else
+  const int open_mode = RTLD_LAZY | RTLD_LOCAL;
+#endif  // _WIN32
+  void* const handle = dlopen(dso_name.c_str(), open_mode);
+  if (handle != nullptr) {
+    return handle;  // library found, nothing to report
+  }
+  // Open failed: emit the setup hint and hand the null handle to the caller.
+  const char* const not_found_hint =
+      "You are using Paddle compiled with TensorRT, but TensorRT dynamic "
+      "library is not found. Ignore this if TensorRT is not needed.\n"
+      "The TensorRT that Paddle depends on is not configured correctly.\n"
+      "  Suggestions:\n"
+      "  1. Check if the TensorRT is installed correctly and its version"
+      " is matched with paddlepaddle you installed.\n"
+      "  2. Configure environment variables as "
+      "follows:\n"
+      "  - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
+      "  - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
+      "  - Mac: set  DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...`\n";
+  LOG(WARNING) << not_found_hint;
+  return nullptr;
+}
+
+void* GetTensorRtHandle() {  // opens the platform-specific nvinfer library
+#if defined(__APPLE__) || defined(__OSX__)
+  const std::string lib_file{"libnvinfer.dylib"};
+#elif defined(_WIN32)
+  const std::string lib_file{"nvinfer.dll"};
+#else
+  const std::string lib_file{"libnvinfer.so"};
+#endif
+  return GetDsoHandle(lib_file);
+}
+
+void* GetTensorRtPluginHandle() {  // opens the platform-specific plugin lib
+#if defined(__APPLE__) || defined(__OSX__)
+  const std::string lib_file{"libnvinfer_plugin.dylib"};
+#elif defined(_WIN32)
+  const std::string lib_file{"nvinfer_plugin.dll"};
+#else
+  const std::string lib_file{"libnvinfer_plugin.so"};
+#endif
+  return GetDsoHandle(lib_file);
+}
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/tensorrt.h b/paddle/pten/backends/dynload/tensorrt.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed710085acd658d829b813f4e7841158299bb94b
--- /dev/null
+++ b/paddle/pten/backends/dynload/tensorrt.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <NvInfer.h>
+#include <NvInferPlugin.h>
+#if !defined(_WIN32)
+#include <dlfcn.h>
+#endif
+
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+
+namespace pten {
+namespace dynload {
+
+void* GetTensorRtHandle();
+
+extern std::once_flag tensorrt_dso_flag;
+extern void* tensorrt_dso_handle;
+
+void* GetTensorRtPluginHandle();
+extern std::once_flag tensorrt_plugin_dso_flag;
+extern void* tensorrt_plugin_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP(__name)             \
+  struct DynLoad__##__name { /* lazy proxy for ::__name */             \
+    template <typename... Args>                                        \
+    void* operator()(Args... args) {                                   \
+      std::call_once(tensorrt_dso_flag, []() {                         \
+        tensorrt_dso_handle = pten::dynload::GetTensorRtHandle();      \
+      }); /* library handle fetched only once */                       \
+      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);   \
+      if (p_##__name == nullptr) {                                     \
+        return nullptr; /* symbol missing: report null */              \
+      }                                                                \
+      using tensorrt_func = decltype(&::__name);                       \
+      auto ret = reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
+      return static_cast<void*>(ret);                                  \
+    }                                                                  \
+  };                                                                   \
+  extern DynLoad__##__name __name
+
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name)              \
+  struct DynLoad__##__name { /* lazy proxy for ::__name */                  \
+    template <typename... Args>                                             \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {        \
+      std::call_once(tensorrt_dso_flag, []() {                              \
+        tensorrt_dso_handle = pten::dynload::GetTensorRtHandle();           \
+      }); /* library handle fetched only once */                            \
+      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);        \
+      PADDLE_ENFORCE_NOT_NULL(p_##__name,                                   \
+                              paddle::platform::errors::Unavailable(        \
+                                  "Load tensorrt api %s failed", #__name)); \
+      using tensorrt_func = decltype(&::__name);                            \
+      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);          \
+    }                                                                       \
+  };                                                                        \
+  extern DynLoad__##__name __name
+
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name)                      \
+  struct DynLoad__##__name { /* lazy proxy for ::__name */                     \
+    template <typename... Args>                                                \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {           \
+      std::call_once(tensorrt_plugin_dso_flag, []() {                          \
+        tensorrt_plugin_dso_handle = pten::dynload::GetTensorRtPluginHandle(); \
+      }); /* library handle fetched only once */                               \
+      static void* p_##__name = dlsym(tensorrt_plugin_dso_handle, #__name);    \
+      PADDLE_ENFORCE_NOT_NULL(p_##__name,                                      \
+                              paddle::platform::errors::Unavailable(           \
+                                  "Load tensorrt plugin %s failed", #__name)); \
+      using tensorrt_plugin_func = decltype(&::__name);                        \
+      return reinterpret_cast<tensorrt_plugin_func>(p_##__name)(args...);      \
+    }                                                                          \
+  };                                                                           \
+  extern DynLoad__##__name __name
+
+#ifdef NV_TENSORRT_MAJOR
+
+#if (NV_TENSORRT_MAJOR >= 6)
+#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \
+  __macro(createInferBuilder_INTERNAL);             \
+  __macro(createInferRuntime_INTERNAL);             \
+  __macro(getPluginRegistry);
+#else
+#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \
+  __macro(createInferBuilder_INTERNAL);             \
+  __macro(createInferRuntime_INTERNAL);
+#endif
+
+#define TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(__macro) \
+  __macro(getInferLibVersion);
+
+#define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \
+  __macro(initLibNvInferPlugins);
+
+TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP)
+TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(
+    DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP)
+TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP)
+
+#endif  // end of NV_TENSORRT_MAJOR
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/warpctc.cc b/paddle/pten/backends/dynload/warpctc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c34b016732a29979602e210027f80a005325ae7
--- /dev/null
+++ b/paddle/pten/backends/dynload/warpctc.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/backends/dynload/warpctc.h"
+
+namespace pten {
+namespace dynload {
+
+std::once_flag warpctc_dso_flag;     // guards the one-time handle lookup
+void* warpctc_dso_handle = nullptr;  // assigned inside std::call_once
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name  // one wrapper object
+
+WARPCTC_ROUTINE_EACH(DEFINE_WRAP);  // define a wrapper for every listed symbol
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/warpctc.h b/paddle/pten/backends/dynload/warpctc.h
new file mode 100644
index 0000000000000000000000000000000000000000..2852293a686d4af6a82e42872a7a02ca9e48571b
--- /dev/null
+++ b/paddle/pten/backends/dynload/warpctc.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <mutex>  // NOLINT
+
+#include "paddle/pten/backends/dynload/dynamic_loader.h"
+#include "paddle/pten/backends/dynload/port.h"
+#include "warpctc/include/ctc.h"
+
+namespace pten {
+namespace dynload {
+
+extern std::once_flag warpctc_dso_flag;
+extern void* warpctc_dso_handle;
+
+/**
+ * Generates, per warpctc routine, a functor struct and an extern instance
+ * named after the routine. The call operator loads the warpctc DSO once
+ * (std::call_once), resolves the symbol via dlsym, and forwards its args.
+ */
+#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                            \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using warpctcFunc = decltype(&::__name);                       \
+      std::call_once(warpctc_dso_flag, []() {                        \
+        warpctc_dso_handle = pten::dynload::GetWarpCTCDsoHandle();   \
+      });                                                            \
+      static void* p_##__name = dlsym(warpctc_dso_handle, #__name);  \
+      return reinterpret_cast<warpctcFunc>(p_##__name)(args...);     \
+    }                                                                \
+  };                                                                 \
+  extern DynLoad__##__name __name
+
+#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
+  DYNAMIC_LOAD_WARPCTC_WRAP(__name)
+
+#define WARPCTC_ROUTINE_EACH(__macro) \
+  __macro(get_warpctc_version);       \
+  __macro(ctcGetStatusString);        \
+  __macro(compute_ctc_loss);          \
+  __macro(compute_ctc_loss_double);   \
+  __macro(get_workspace_size);        \
+  __macro(get_workspace_size_double)
+
+WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP);
+
+#undef DYNAMIC_LOAD_WARPCTC_WRAP
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/common/scalar.h b/paddle/pten/common/scalar.h
index 36205a0e4c2f9816f1e51a514142e4c61dc0bd62..5c8fb04633088a0f9bc53877e1ab7bddf1f073ad 100644
--- a/paddle/pten/common/scalar.h
+++ b/paddle/pten/common/scalar.h
@@ -133,9 +133,6 @@ class ScalarBase {
       case DataType::INT8:
         data_.i8 = tensor.template data<int8_t>()[0];
         break;
-      case DataType::UINT16:
-        data_.ui16 = tensor.template data<uint16_t>()[0];
-        break;
       case DataType::UINT8:
         data_.ui8 = tensor.template data<uint8_t>()[0];
         break;
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index 181012732fa35b58cfffcd932926691375453a5d..b281f95f36bbd9100987a7d92c03822131ff0200 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -23,12 +23,6 @@ cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
 
 cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
-cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
-if(WITH_GPU)
-  nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-elseif(WITH_ROCM)
-  hip_test(dim_test SRCS dim_test.cu DEPS ddim)
-endif()
 
 cc_library(selected_rows SRCS selected_rows.cc DEPS dense_tensor mixed_vector enforce ddim)
 
diff --git a/paddle/pten/core/allocator.h b/paddle/pten/core/allocator.h
index 2647490c9f58ba9f4e905654c70e5a30a73cdebc..75d42c4fd15cb13e10c86dd1f1b42700a53b83bd 100644
--- a/paddle/pten/core/allocator.h
+++ b/paddle/pten/core/allocator.h
@@ -15,46 +15,16 @@ limitations under the License. */
 #pragma once
 
 #include <cstdint>
+#include <functional>
 #include "paddle/fluid/platform/place.h"
-#include "paddle/pten/core/candidate/allocator.h"
 
 namespace pten {
-namespace deprecated {
 
-/// \brief Encapsulates strategies for access/addressing, allocation/
-/// deallocation and construction/destruction of objects.
-class RawAllocator {
- public:
-  using Place = paddle::platform::Place;
-
-  /// \brief Default destructor.
-  virtual ~RawAllocator() = default;
-
-  /// \brief Allocates storage suitable for an array object of n bytes
-  /// and creates the array, but does not construct array elements.
-  /// May throw exceptions.
-  /// \param bytes_size The number of bytes to allocate.
-  /// \return The first address allocated.
-  virtual void* Allocate(size_t bytes_size) = 0;
-
-  /// \brief Deallocates storage pointed to ptr, which must be a value
-  /// returned by a previous call to allocate that has not been
-  /// invalidated by an intervening call to deallocate. The bytes_size
-  /// must match the value previously passed to allocate.
-  /// \param ptr The first address to deallocate.
-  /// \param bytes_size The number of bytes to deallocate.
-  virtual void Deallocate(void* ptr, size_t bytes_size) = 0;
-
-  /// \brief Get the place value of the allocator and the allocation.
-  /// \return The place value of the allocator and the allocation.
-  virtual const Place& place() const = 0;
-};
-
-/// \brief Fancy pointer with context. The use of this data type
+/// \brief Fancy pointer with deleter. The use of this data type
 /// is to be compatible with allocators from different frameworks
 /// without significant performance loss. This class does not
 /// support being inherited.
-class Allocation final {
+class Allocation {
  public:
   using Place = paddle::platform::Place;
   using DeleterFnPtr = void (*)(Allocation*);
@@ -62,63 +32,54 @@ class Allocation final {
   Allocation() = default;
 
   // Don't own resources, only provide access.
-  Allocation(void* data, const Place& place) : data_(data), place_(place) {}
+  Allocation(void* data, size_t size, const Place& place)
+      : ptr_(data), size_(size), place_(place) {}
 
   // Own resources.
-  Allocation(void* data, void* ctx, DeleterFnPtr deleter, const Place& place)
-      : data_(data), ctx_(ctx), deleter_(deleter), place_(place) {}
+  Allocation(void* data, size_t size, DeleterFnPtr deleter, const Place& place)
+      : ptr_(data), size_(size), deleter_(deleter), place_(place) {}
 
-  Allocation(Allocation&& other) { swap(*this, other); }
-  Allocation& operator=(Allocation&& other) {
+  Allocation(Allocation&& other) noexcept { swap(*this, other); }
+  Allocation& operator=(Allocation&& other) noexcept {
     // Exchange them explicitly to avoid moving is equivalent
     // to copying.
     swap(*this, other);
     return *this;
   }
-  ~Allocation() { Clear(); }
 
-  void* ptr() const noexcept { return data_; }
-  void* operator->() const noexcept { return data_; }
-  operator bool() const noexcept { return data_ || ctx_; }
-  const Place& place() const noexcept { return place_; }
-
-  void Clear() {
+  virtual ~Allocation() {
     if (deleter_) {
       deleter_(this);
     }
-    ctx_ = nullptr;
-    deleter_ = nullptr;
-    data_ = nullptr;
   }
 
+  // Returns the holding pointer.
+  // NOTE: For performance reasons, it is better not to make this method
+  // virtual. If we want to implement `defragmentation` later, we might
+  // need to keep the `ptr_` field protected and add a virtual method like
+  // `defragmentation` to change `ptr_`.
+  void* ptr() const noexcept { return ptr_; }
+
+  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
+  // last valid element.
+  //
+  // NOTE: Some allocators might allocate more memory than requested, so the
+  // size could be larger than the request. For example,
+  //    the AlignedAllocator will always allocate memory as size + kAlignment.
+  //    The raw pointer might not be aligned, so an offset might be added to
+  //    the raw pointer. The size of this allocation will be
+  //    `size + kAlignment - offset`.
+  size_t size() const noexcept { return size_; }
+
+  void* operator->() const noexcept { return ptr_; }
+  operator bool() const noexcept { return ptr_; }
+  const Place& place() const noexcept { return place_; }
   DeleterFnPtr deleter() const noexcept { return deleter_; }
 
-  template <typename T>
-  T* CastContextWithoutCheck() const noexcept {
-    return static_cast<T*>(ctx_);
-  }
-
-  /// \brief Statically cast the void pointer of the context object to
-  /// the primitive type. Conversion of any pointer to void* and back
-  /// to pointer to the original cv type preserves its original value.
-  /// \param T The primitive type name of the context pointer.
-  /// \param expected_deleter The destructor passed in to enhance type
-  /// safety checking.
-  template <typename T>
-  T* CastContext(DeleterFnPtr expected_deleter) const {
-    PADDLE_ENFORCE_EQ(
-        deleter_ == expected_deleter,
-        true,
-        paddle::platform::errors::InvalidArgument(
-            "The deleter of the allocation does not match, so the pointer "
-            "cannot be safely removed."));
-    return CastContextWithoutCheck<T>();
-  }
-
- private:
+ protected:
   friend void swap(Allocation& a, Allocation& b) noexcept;
-  void* data_{nullptr};
-  void* ctx_{nullptr};
+  void* ptr_{nullptr};
+  size_t size_{};
   DeleterFnPtr deleter_{nullptr};
   // TODO(Shixiaowei02): Enum needs to be used instead to reduce
   // the construction overhead by more than 50%.
@@ -126,28 +87,21 @@ class Allocation final {
 };
 
 inline void swap(Allocation& a, Allocation& b) noexcept {
-  ::std::swap(a.data_, b.data_);
-  ::std::swap(a.ctx_, b.ctx_);
+  ::std::swap(a.ptr_, b.ptr_);
   ::std::swap(a.deleter_, b.deleter_);
   ::std::swap(a.place_, b.place_);
+  ::std::swap(a.size_, b.size_);
 }
 
-/// \brief Context compatible allocator interface. This allocator is
-/// mainly used for general data structures such as Tensor. The raw
-/// allocator is more universal and efficient.
 class Allocator {
-  using Place = paddle::platform::Place;
-
  public:
+  using DeleterType = std::function<void(Allocation*)>;
+  using AllocationPtr = std::unique_ptr<Allocation, DeleterType>;
+
   virtual ~Allocator() = default;
-  virtual Allocation Allocate(size_t bytes_size) = 0;
-  virtual const Place& place() = 0;
-};
+  virtual AllocationPtr Allocate(size_t bytes_size) = 0;
 
-inline Allocation Allocate(const std::shared_ptr<Allocator>& a, size_t n) {
-  CHECK(a);
-  return a->Allocate(n);
-}
+  virtual bool IsAllocThreadSafe() const { return false; }
+};
 
-}  // namespace deprecated
 }  // namespace pten
diff --git a/paddle/pten/core/candidate/allocator.h b/paddle/pten/core/candidate/allocator.h
deleted file mode 100644
index 75d42c4fd15cb13e10c86dd1f1b42700a53b83bd..0000000000000000000000000000000000000000
--- a/paddle/pten/core/candidate/allocator.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cstdint>
-#include <functional>
-#include "paddle/fluid/platform/place.h"
-
-namespace pten {
-
-/// \brief Fancy pointer with deleter. The use of this data type
-/// is to be compatible with allocators from different frameworks
-/// without significant performance loss. This class does not
-/// support being inherited.
-class Allocation {
- public:
-  using Place = paddle::platform::Place;
-  using DeleterFnPtr = void (*)(Allocation*);
-
-  Allocation() = default;
-
-  // Don't own resources, only provide access.
-  Allocation(void* data, size_t size, const Place& place)
-      : ptr_(data), size_(size), place_(place) {}
-
-  // Own resources.
-  Allocation(void* data, size_t size, DeleterFnPtr deleter, const Place& place)
-      : ptr_(data), size_(size), deleter_(deleter), place_(place) {}
-
-  Allocation(Allocation&& other) noexcept { swap(*this, other); }
-  Allocation& operator=(Allocation&& other) noexcept {
-    // Exchange them explicitly to avoid moving is equivalent
-    // to copying.
-    swap(*this, other);
-    return *this;
-  }
-
-  virtual ~Allocation() {
-    if (deleter_) {
-      deleter_(this);
-    }
-  }
-
-  // Returns the holding pointer.
-  // NOTE: For performance consideration, it is better not to make this method
-  // as a virtual method. If we want to implement a `defragmentation` later,
-  // we might need to make `ptr_` field as a protected field, and add a virtual
-  // method like `defragmentation` to change `ptr_`.
-  void* ptr() const noexcept { return ptr_; }
-
-  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
-  // last valid element.
-  //
-  // NOTE: Some allocator might alloc more memory than request. The size
-  // could larger than its request. For example,
-  //    the AlignedAllocator will always allocate memory as size + kAlignment.
-  //    The raw pointer might not aligned, so an offset might be added to raw
-  //    the pointer. The size of this allocation will be
-  //    `size + kAlignemnt - offset`.
-  size_t size() const noexcept { return size_; }
-
-  void* operator->() const noexcept { return ptr_; }
-  operator bool() const noexcept { return ptr_; }
-  const Place& place() const noexcept { return place_; }
-  DeleterFnPtr deleter() const noexcept { return deleter_; }
-
- protected:
-  friend void swap(Allocation& a, Allocation& b) noexcept;
-  void* ptr_{nullptr};
-  size_t size_{};
-  DeleterFnPtr deleter_{nullptr};
-  // TODO(Shixiaowei02): Enum needs to be used instead to reduce
-  // the construction overhead by more than 50%.
-  Place place_;
-};
-
-inline void swap(Allocation& a, Allocation& b) noexcept {
-  ::std::swap(a.ptr_, b.ptr_);
-  ::std::swap(a.deleter_, b.deleter_);
-  ::std::swap(a.place_, b.place_);
-  ::std::swap(a.size_, b.size_);
-}
-
-class Allocator {
- public:
-  using DeleterType = std::function<void(Allocation*)>;
-  using AllocationPtr = std::unique_ptr<Allocation, DeleterType>;
-
-  virtual ~Allocator() = default;
-  virtual AllocationPtr Allocate(size_t bytes_size) = 0;
-
-  virtual bool IsAllocThreadSafe() const { return false; }
-};
-
-}  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc
index 4008b6f6cee8e709eabf59c1344624b15bbbb565..b1a5015f010c2002b8e5dbb6fc9eac1269224ad1 100644
--- a/paddle/pten/core/dense_tensor.cc
+++ b/paddle/pten/core/dense_tensor.cc
@@ -33,28 +33,17 @@ extern void TensorCopy(const pten::DenseTensor& src,
 namespace pten {
 
 DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta)
-    : meta_(meta),
-      storage_(make_intrusive<TensorStorage>(a, SizeOf(dtype()) * numel())) {}
+    : meta_(meta), holder_(a->Allocate(SizeOf(dtype()) * numel())) {}
 
 DenseTensor::DenseTensor(Allocator* a, DenseTensorMeta&& meta)
-    : meta_(std::move(meta)),
-      storage_(make_intrusive<TensorStorage>(a, SizeOf(dtype()) * numel())) {}
+    : meta_(std::move(meta)), holder_(a->Allocate(SizeOf(dtype()) * numel())) {}
 
-DenseTensor::DenseTensor(intrusive_ptr<Storage> storage,
+DenseTensor::DenseTensor(const std::shared_ptr<pten::Allocation>& holder,
                          const DenseTensorMeta& meta)
-    : meta_(meta), storage_(std::move(storage)) {}
-
-DenseTensor::DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta)
-    : meta_(std::move(meta)), storage_(std::move(storage)) {}
+    : meta_(meta), holder_(holder) {}
 
 DenseTensor::DenseTensor(const DenseTensor& other) : meta_(other.meta()) {
-  if (storage_ == nullptr) {
-    storage_ = make_intrusive<paddle::experimental::SharedStorage>(
-        paddle::platform::CPUPlace());
-  }
-  if (other.storage_ != nullptr && other.storage_->data_shared()) {
-    storage_->set_data_shared(other.storage_->data_shared());
-  }
+  holder_ = other.holder_;
 
 #ifdef PADDLE_WITH_MKLDNN
   format_ = other.format_;
@@ -63,13 +52,7 @@ DenseTensor::DenseTensor(const DenseTensor& other) : meta_(other.meta()) {
 
 DenseTensor& DenseTensor::operator=(const DenseTensor& other) {
   meta_ = other.meta();
-  if (storage_ == nullptr) {
-    storage_ = make_intrusive<paddle::experimental::SharedStorage>(
-        paddle::platform::CPUPlace());
-  }
-  if (other.storage_ != nullptr && other.storage_->data_shared()) {
-    storage_->set_data_shared(other.storage_->data_shared());
-  }
+  holder_ = other.holder_;
 #ifdef PADDLE_WITH_MKLDNN
   format_ = other.format_;
 #endif
@@ -78,7 +61,7 @@ DenseTensor& DenseTensor::operator=(const DenseTensor& other) {
 
 DenseTensor& DenseTensor::operator=(DenseTensor&& other) {
   meta_ = std::move(other.meta_);
-  storage_.swap(other.storage_);
+  std::swap(holder_, other.holder_);
   return *this;
 }
 
@@ -90,59 +73,7 @@ int64_t DenseTensor::numel() const {
 }
 
 bool DenseTensor::IsSharedWith(const DenseTensor& b) const {
-  return storage_.get() == b.storage_.get() && storage_.get() != nullptr;
-}
-
-void* DenseTensor::mutable_data(size_t request_bytes) {
-  PADDLE_ENFORCE(
-      valid(),
-      paddle::platform::errors::PreconditionNotMet(
-          "The meta data must be valid when call the mutable data function."));
-  PADDLE_ENFORCE_NOT_NULL(
-      storage_,
-      paddle::platform::errors::PreconditionNotMet(
-          "The storage must be valid when call the mutable data function."));
-  size_t bytes = numel() * SizeOf(dtype());
-  if (request_bytes) {
-    PADDLE_ENFORCE_GE(request_bytes,
-                      bytes,
-                      paddle::platform::errors::InvalidArgument(
-                          "The reserved size %d should be enough to meet the "
-                          "volume required by metadata %d.",
-                          request_bytes,
-                          bytes));
-    bytes = request_bytes;
-  }
-  if (!storage_->data() || storage_->size() < bytes + meta_.offset ||
-      storage_->size() == 0) {
-    VLOG(10) << "mutbale data realloc, original size: " << storage_->size()
-             << ", new size: " << bytes;
-    storage_->Realloc(bytes);
-    meta_.offset = 0;
-  }
-  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(storage_->data()) +
-                                 meta_.offset);
-}
-
-template <typename T>
-T* DenseTensor::mutable_data() {
-  // In order to be compatible with the original Tensor design and
-  // execution system, we have to reset the datatype in mutable_data<T>.
-  // When the compatibility phase is over in the future, we can delete it
-  if (meta_.dtype == DataType::UNDEFINED) {
-    VLOG(10) << "change data type in mutbale_data, target dtype - "
-             << paddle::experimental::CppTypeToDataType<T>::Type();
-    const_cast<DataType&>(meta_.dtype) =
-        paddle::experimental::CppTypeToDataType<T>::Type();
-  }
-  PADDLE_ENFORCE(
-      (dtype() == paddle::experimental::CppTypeToDataType<T>::Type()),
-      paddle::platform::errors::InvalidArgument(
-          "The type of data (%d) we are trying to retrieve does not match the "
-          "type of data currently contained in the container (%d).",
-          static_cast<int>(paddle::experimental::CppTypeToDataType<T>::Type()),
-          static_cast<int>(dtype())));
-  return static_cast<T*>(mutable_data());
+  return holder_ && holder_ == b.Holder();
 }
 
 template <typename T>
@@ -164,29 +95,27 @@ T* DenseTensor::data() {
       paddle::platform::errors::InvalidArgument(
           "The type of data we are trying to retrieve does not match the "
           "type of data currently contained in the container."));
-  PADDLE_ENFORCE_NOT_NULL(
-      storage_,
-      paddle::platform::errors::PreconditionNotMet(
-          "The storage must be valid when call the mutable data function."));
-  return reinterpret_cast<T*>(data());
+  return static_cast<T*>(data());
 }
 
 void* DenseTensor::data() {
+  check_memory_size();
   PADDLE_ENFORCE_NOT_NULL(
-      storage_,
+      holder_,
       paddle::platform::errors::PreconditionNotMet(
-          "The storage must be valid when call the mutable data function."));
-  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(storage_->data()) +
+          "The storage must be valid when call the data function."));
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  meta_.offset);
 }
 
 const void* DenseTensor::data() const {
+  check_memory_size();
   PADDLE_ENFORCE_NOT_NULL(
-      storage_,
+      holder_,
       paddle::platform::errors::PreconditionNotMet(
-          "The storage must be valid when call the mutable data function."));
+          "The storage must be valid when call the data function."));
   return reinterpret_cast<const void*>(
-      reinterpret_cast<uintptr_t>(storage_->data()) + meta_.offset);
+      reinterpret_cast<uintptr_t>(holder_->ptr()) + meta_.offset);
 }
 
 void DenseTensor::set_meta(DenseTensorMeta&& meta) {
@@ -209,15 +138,14 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) {
    */
 void DenseTensor::ResizeAndAllocate(const DDim& dims) {
   meta_.dims = dims;
-  if (storage_ != nullptr) {
-    mutable_data();
+  if (holder_ != nullptr && place().GetType() != AllocationType::UNDEFINED) {
+    mutable_data(place());
   }
 }
 
 void DenseTensor::ResetLoD(const LoD& lod) { meta_.lod = lod; }
 
 #define DATA_MEMBER_FUNC_INSTANTIATION(dtype)      \
-  template dtype* DenseTensor::mutable_data();     \
   template const dtype* DenseTensor::data() const; \
   template dtype* DenseTensor::data();
 
@@ -243,68 +171,47 @@ DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128);
 /*   From framework::Tensor    */
 /* --------------------------- */
 DenseTensor::DenseTensor() {
-  storage_ = make_intrusive<paddle::experimental::SharedStorage>(
-      paddle::platform::CPUPlace());
   inplace_version_counter_ = std::make_shared<TensorInplaceVersion>(0);
   meta_.dtype = paddle::experimental::DataType::FLOAT32;
   meta_.offset = 0;
 }
 
-DenseTensor::DenseTensor(const paddle::framework::proto::VarType::Type& dtype) {
-  storage_ = make_intrusive<paddle::experimental::SharedStorage>(
-      paddle::platform::CPUPlace());
+DenseTensor::DenseTensor(paddle::framework::proto::VarType::Type dtype) {
   inplace_version_counter_ = std::make_shared<TensorInplaceVersion>(0);
   meta_.dtype = TransToPtenDataType(dtype);
   meta_.offset = 0;
 }
 
 size_t DenseTensor::memory_size() const {
-  if (storage_ == nullptr || storage_->data_shared() == nullptr) {
-    return 0UL;
-  }
-
-  return storage_->data_shared()->size() - meta_.offset;
+  return holder_ == nullptr ? 0UL : holder_->size() - meta_.offset;
 }
 
 void DenseTensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(storage_,
+  PADDLE_ENFORCE_NOT_NULL(holder_,
                           paddle::platform::errors::PreconditionNotMet(
                               "Tensor holds no memory. "
                               "Call Tensor::mutable_data firstly."));
-  PADDLE_ENFORCE_NOT_NULL(storage_->data_shared(),
-                          paddle::platform::errors::PreconditionNotMet(
-                              "Tensor holds no memory. "
-                              "Call Tensor::mutable_data firstly."));
-  size_t size = numel() * SizeOf(dtype());
-
   PADDLE_ENFORCE_LE(
-      size,
+      numel() * SizeOf(dtype()),
       memory_size(),
       paddle::platform::errors::PreconditionNotMet(
           "Tensor's dimension is out of bound."
           "Tensor's dimension must be equal or less than the size of its "
           "memory."
-          "But received  Tensor's dimension is d%, memory's size is %d.",
-          size,
+          "But received Tensor's dimension is %d, memory's size is %d.",
+          numel() * SizeOf(dtype()),
           memory_size()));
 }
 
 const paddle::platform::Place& DenseTensor::place() const {
   PADDLE_ENFORCE_NOT_NULL(
-      storage_,
+      holder_,
       paddle::platform::errors::PreconditionNotMet(
-          "Tensor not initialized yet when Tensor::place() is called."));
-  if (storage_->data_shared()) {
-    return storage_->data_shared()->place();
-  }
-  return storage_->place();
+          "Tensor not initialized yet when DenseTensor::place() is called."));
+  return holder_->place();
 }
 
 paddle::framework::proto::VarType::Type DenseTensor::type() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      storage_,
-      paddle::platform::errors::PreconditionNotMet(
-          "Tensor not initialized yet when Tensor::type() is called."));
   return TransToProtoVarType(meta_.dtype);
 }
 
@@ -316,39 +223,31 @@ void DenseTensor::set_layout(const paddle::framework::DataLayout layout) {
   meta_.layout = layout;
 }
 
-void DenseTensor::ResetHolder(
-    const std::shared_ptr<paddle::memory::Allocation>& holder) {
+void DenseTensor::ResetHolder(const std::shared_ptr<pten::Allocation>& holder) {
   PADDLE_ENFORCE_EQ(
       meta_.offset,
       0,
       paddle::platform::errors::Fatal(
           "Only the offset is supported to zero when the holder is reset."));
 
-  PADDLE_ENFORCE_NOT_NULL(
-      storage_,
-      paddle::platform::errors::PreconditionNotMet(
-          "The storage must be valid when call the mutable data function."));
-
-  if (storage_->data_shared()) {
+  if (holder_) {
     PADDLE_ENFORCE_LE(
         numel() * SizeOf(dtype()) + meta_.offset,
         holder->size(),
         paddle::platform::errors::InvalidArgument(
             "The size of Holder is not enough to store the Tensor."));
   }
-
-  storage_->set_data_shared(holder);
+  holder_ = holder;
 }
 
 void DenseTensor::ResetHolderWithType(
-    const std::shared_ptr<paddle::memory::Allocation>& holder,
-    const paddle::framework::proto::VarType::Type& type) {
+    const std::shared_ptr<pten::Allocation>& holder,
+    paddle::framework::proto::VarType::Type type) {
   set_type(type);
   ResetHolder(holder);
 }
 
-void DenseTensor::set_type(
-    const paddle::framework::proto::VarType::Type& type) {
+void DenseTensor::set_type(paddle::framework::proto::VarType::Type type) {
   meta_.dtype = TransToPtenDataType(type);
 }
 
@@ -369,19 +268,14 @@ void* DenseTensor::mutable_data(const paddle::platform::Place& place,
     size = requested_size;
   }
 
-  if (storage_ == nullptr) {
-    storage_ = make_intrusive<paddle::experimental::SharedStorage>(place);
-  }
-
   /* some versions of boost::variant don't have operator!= */
-  if (storage_->data_shared() == nullptr ||
-      !(storage_->data_shared()->place() == place) ||
-      storage_->data_shared()->size() < size + meta_.offset) {
-    storage_->Clear();
-    storage_->set_data_shared(paddle::memory::AllocShared(place, size));
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + meta_.offset) {
+    holder_.reset();
+    holder_ = paddle::memory::AllocShared(place, size);
     meta_.offset = 0;
   }
-  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(storage_->data()) +
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  meta_.offset);
 }
 
@@ -404,21 +298,16 @@ void* DenseTensor::mutable_data(const paddle::platform::Place& place,
           "] now"));
   size_t size = numel() * SizeOf(dtype());
 
-  if (storage_ == nullptr) {
-    storage_ = make_intrusive<paddle::experimental::SharedStorage>(place);
-  }
-
   /* some versions of boost::variant don't have operator!= */
-  if (storage_->data_shared() == nullptr ||
-      !(storage_->data_shared()->place() == place) ||
-      storage_->data_shared()->size() < size + meta_.offset ||
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + meta_.offset ||
       !(paddle::platform::is_gpu_place(place) &&
-        paddle::memory::InSameStream(storage_->data_shared(), stream))) {
-    storage_->Clear();
-    storage_->set_data_shared(paddle::memory::AllocShared(place, size, stream));
+        paddle::memory::InSameStream(holder_, stream))) {
+    holder_.reset();
+    holder_ = paddle::memory::AllocShared(place, size, stream);
     meta_.offset = 0;
   }
-  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(storage_->data()) +
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  meta_.offset);
 }
 
@@ -445,14 +334,9 @@ inline T* DenseTensor::mutable_data(const paddle::platform::Place& place,
 }
 
 void DenseTensor::ShareBufferWith(const DenseTensor& tensor) {
-  if (storage_ == nullptr) {
-    storage_ = make_intrusive<paddle::experimental::SharedStorage>(
-        paddle::platform::CPUPlace());
-  }
-  if (storage_ != nullptr && tensor.storage_ != nullptr) {
-    storage_->set_data_shared(tensor.storage_->data_shared());
-  }
+  holder_ = tensor.holder_;
   meta_.offset = tensor.meta().offset;
+  meta_.dtype = tensor.dtype();
 }
 
 #define LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(dtype) \
@@ -467,7 +351,7 @@ LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(bool)
 LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int8_t)
 LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(uint8_t)
 LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int16_t)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int)
+LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int32_t)
 LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int64_t)
 LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(float)
 LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(double)
@@ -482,6 +366,13 @@ LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128)
 /*   From framework::LoDTensor    */
 /* ------------------------------ */
 
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage,
+                         const DenseTensorMeta& meta)
+    : meta_(meta), holder_(storage->move_data_shared()) {}
+
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta)
+    : meta_(std::move(meta)), holder_(storage->move_data_shared()) {}
+
 DenseTensor::DenseTensor(const LoD& lod) : DenseTensor() { meta_.lod = lod; }
 
 void DenseTensor::set_lod(const LoD& lod) { meta_.lod = lod; }
@@ -559,9 +450,8 @@ DenseTensor DenseTensor::Slice(int64_t begin_idx, int64_t end_idx) const {
   } else {
     size_t base = numel() / meta_.dims[0];
     DenseTensor dst;
-    dst.storage_ = pten::make_intrusive<paddle::experimental::SharedStorage>(
-        storage_->data_shared());
-    dst.meta_.layout = meta_.layout;
+    dst.holder_ = holder_;
+    dst.set_layout(meta_.layout);
     dst.meta_.dtype = meta_.dtype;
     DDim dst_dims = meta_.dims;
     dst_dims[0] = end_idx - begin_idx;
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index 216689c9b64173166f3c33cc2b0cbaa61e42a66a..88c459e6d87eaee4cd52111c42458868698eda43 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -70,17 +70,8 @@ class DenseTensor : public TensorBase,
   /// \param meta The meta data of dense tensor.
   DenseTensor(Allocator* a, DenseTensorMeta&& meta);
 
-  /// \brief Use existing storage space to create dense tensor. This interface
-  /// can be used to deliberately create an uninitialized dense tensor.
-  /// \param storage The existing storage.
-  /// \param meta The meta data of dense tensor.
-  DenseTensor(intrusive_ptr<Storage> storage, const DenseTensorMeta& meta);
-
-  /// \brief Use existing storage space to create dense tensor. This interface
-  /// can be used to deliberately create an uninitialized dense tensor.
-  /// \param storage The existing storage.
-  /// \param meta The meta data of dense tensor.
-  DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta);
+  DenseTensor(const std::shared_ptr<pten::Allocation>& holder,
+              const DenseTensorMeta& meta);
 
   /// \brief Because dense tensor is a kind of container, we give a default
   /// constructor to use for stl container. But the dense tensor created with
@@ -146,9 +137,7 @@ class DenseTensor : public TensorBase,
 
   /// \brief Test whether the storage is allocated.
   /// return Whether the storage is allocated.
-  bool initialized() const override {
-    return storage_ != nullptr && storage_->data() != nullptr;
-  }
+  bool initialized() const override { return holder_ && holder_->ptr(); }
 
   /// \brief Check if storage is shared with other objects.
   /// \return Whether the storage is shared with other objects.
@@ -170,25 +159,7 @@ class DenseTensor : public TensorBase,
   /// \brief Returns the actual storage size occupied by tensor, may be larger
   /// than its shape dims.
   /// \return The actual storage size occupied by tensor.
-  size_t capacity() const { return storage_->size(); }
-
-  /// \brief Get the mutable data pointer value of type T.
-  /// Memory allocation may occur when calling this interface:
-  /// 1. When the storage size is not enough to meet the current shape of the
-  /// data.
-  /// \return The mutable data pointer value of type T.
-  template <typename T>
-  T* mutable_data();
-
-  /// \brief Get the mutable data pointer value of raw type.
-  /// Memory allocation may occur when calling this interface:
-  /// 1. When the storage size is not enough to meet the current shape of the
-  /// data.
-  /// 2. When more request_bytes parameters are used to reserve the data
-  /// storage.
-  /// param request_bytes The bytes to reserve the data storage.
-  /// \return The mutable data pointer value of type T.
-  void* mutable_data(size_t request_bytes = 0);
+  size_t capacity() const { return holder_ ? holder_->size() : 0; }
 
   /// \brief Get the const data pointer value of type T.
   /// \return The const data pointer value of type T.
@@ -204,7 +175,7 @@ class DenseTensor : public TensorBase,
 
  protected:
   DenseTensorMeta meta_;
-  intrusive_ptr<Storage> storage_;
+  std::shared_ptr<pten::Allocation> holder_;
 
   /* --------------------------- */
   /*   From framework::Tensor    */
@@ -223,11 +194,21 @@ class DenseTensor : public TensorBase,
 
   /* @jim19930609: Remove dependency on protobuf after Tensor Unification.
    */
-  explicit DenseTensor(const paddle::framework::proto::VarType::Type& dtype);
+  explicit DenseTensor(paddle::framework::proto::VarType::Type dtype);
 
-  inline bool IsInitialized() const {
-    return storage_ != nullptr && storage_->data_shared() != nullptr;
-  }
+  /// \brief Use existing storage space to create dense tensor. This interface
+  /// can be used to deliberately create an uninitialized dense tensor.
+  /// \param storage The existing storage.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(intrusive_ptr<Storage> storage, const DenseTensorMeta& meta);
+
+  /// \brief Use existing storage space to create dense tensor. This interface
+  /// can be used to deliberately create an uninitialized dense tensor.
+  /// \param storage The existing storage.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta);
+
+  inline bool IsInitialized() const { return holder_ != nullptr; }
 
   template <typename T>
   T* data();
@@ -270,7 +251,7 @@ class DenseTensor : public TensorBase,
   void set_layout(const paddle::framework::DataLayout layout);
 
   void clear() {
-    storage_.reset();
+    holder_.reset();
     meta_.offset = 0;
   }
 
@@ -281,31 +262,24 @@ class DenseTensor : public TensorBase,
   }
 
   bool IsSharedBufferWith(const DenseTensor& src) const {
-    if (storage_ == nullptr || src.storage_ == nullptr) return false;
-    if (storage_->data_shared() == src.storage_->data_shared()) return true;
-
-    return false;
+    return holder_ && holder_ == src.Holder();  // non-null and same pointer
   }
 
-  const std::shared_ptr<paddle::memory::Allocation> Holder() const {
-    return storage_ == nullptr ? nullptr : std::move(storage_->data_shared());
-  }
+  const std::shared_ptr<pten::Allocation>& Holder() const { return holder_; }
 
   void set_offset(size_t offset) { meta_.offset = offset; }
   size_t offset() const { return meta_.offset; }
 
-  std::shared_ptr<paddle::memory::Allocation> MoveMemoryHolder() {
-    return storage_ == nullptr ? nullptr
-                               : std::move(storage_->move_data_shared());
+  std::shared_ptr<pten::Allocation> MoveMemoryHolder() {
+    return std::move(holder_);  // holder_ is left empty; meta_ is untouched
   }
 
-  void ResetHolder(const std::shared_ptr<paddle::memory::Allocation>& holder);
+  void ResetHolder(const std::shared_ptr<pten::Allocation>& holder);
 
-  void ResetHolderWithType(
-      const std::shared_ptr<paddle::memory::Allocation>& holder,
-      const paddle::framework::proto::VarType::Type& type);
+  void ResetHolderWithType(const std::shared_ptr<pten::Allocation>& holder,
+                           paddle::framework::proto::VarType::Type type);
 
-  void set_type(const paddle::framework::proto::VarType::Type& type);
+  void set_type(paddle::framework::proto::VarType::Type type);
 
   TensorInplaceVersion& InplaceVersionCounter() {
     return *inplace_version_counter_;
diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h
index bb851d954f241fce496f0182c3094d066c72c2d6..1ee2e21494bf544c130ede20ea84c11ae94ca812 100644
--- a/paddle/pten/core/device_context.h
+++ b/paddle/pten/core/device_context.h
@@ -19,7 +19,7 @@ limitations under the License. */
 // TODO(wilber): Do we need to use place in pten kernel?
 #include "paddle/pten/common/place.h"
 
-#include "paddle/pten/core/candidate/allocator.h"
+#include "paddle/pten/core/allocator.h"
 
 namespace pten {
 class TensorBase;
diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h
index 97d7f8d0f110534c9279c5c5717e0151314a60c9..fc8b5dfaab70fb5ea9eb9b3334fbef1d065f88b0 100644
--- a/paddle/pten/core/storage.h
+++ b/paddle/pten/core/storage.h
@@ -56,18 +56,14 @@ class Storage : public intrusive_ref_counter<Storage> {
                  : nullptr;
   }
 
-  const std::shared_ptr<paddle::memory::Allocation> data_shared() const {
+  const std::shared_ptr<paddle::memory::Allocation>& data_shared() const {
     return data_;
   }
 
   virtual void set_data_shared(
-      const std::shared_ptr<paddle::memory::Allocation>& holder) {
-    data_ = holder;
-  }
+      const std::shared_ptr<paddle::memory::Allocation>& holder) = 0;
 
-  std::shared_ptr<paddle::memory::Allocation> move_data_shared() {
-    return std::move(data_);
-  }
+  virtual std::shared_ptr<paddle::memory::Allocation>&& move_data_shared() = 0;
 
   virtual void ReallocShared(size_t n) {
     PADDLE_THROW(paddle::platform::errors::Unimplemented(
@@ -123,6 +119,18 @@ class TensorStorage : public Storage {
 
   bool OwnsMemory() const noexcept override { return true; }
 
+  void set_data_shared(
+      const std::shared_ptr<paddle::memory::Allocation>& holder) override {
+    CHECK(holder);  // fails on a null holder, which is dereferenced below
+    data_ = holder;
+    size_ = holder->size();  // size_ mirrors the new holder's size()
+  }
+
+  std::shared_ptr<paddle::memory::Allocation>&& move_data_shared() override {
+    size_ = 0;  // cleared unconditionally, before the caller moves data_
+    return std::move(data_);  // rvalue ref to data_; the caller does the move
+  }
+
  private:
   Allocator* alloc_;
   int64_t size_{0};
diff --git a/paddle/pten/infermeta/CMakeLists.txt b/paddle/pten/infermeta/CMakeLists.txt
index f92727f33fb05330394226dbdce114b90f4a86ff..8e50d9d2c90d435eddd75f110ca7de38e11c9044 100644
--- a/paddle/pten/infermeta/CMakeLists.txt
+++ b/paddle/pten/infermeta/CMakeLists.txt
@@ -1 +1,2 @@
 cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils)
+cc_library(backward_infermeta SRCS backward.cc DEPS convert_utils)
diff --git a/paddle/pten/infermeta/backward.cc b/paddle/pten/infermeta/backward.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5a66e8cd2ecb384be7bd807269cce290e8f8e04e
--- /dev/null
+++ b/paddle/pten/infermeta/backward.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/infermeta/backward.h"
+
+namespace pten {
+
+std::tuple<DenseTensorMeta, DenseTensorMeta> MatmulGradInferMeta(
+    const DenseTensorMeta& x_meta,
+    const DenseTensorMeta& y_meta,
+    const DenseTensorMeta& out_grad_meta,  // not read by this body
+    bool transpose_x,                      // not read by this body
+    bool transpose_y) {                    // not read by this body
+  return std::make_tuple(x_meta, y_meta);  // copies of x_meta and y_meta
+}
+
+}  // namespace pten
diff --git a/paddle/pten/infermeta/backward.h b/paddle/pten/infermeta/backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..03bdb3a962a96e84f7ed569c18d3b73fad145a78
--- /dev/null
+++ b/paddle/pten/infermeta/backward.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <tuple>
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace pten {
+
+std::tuple<DenseTensorMeta, DenseTensorMeta> MatmulGradInferMeta(
+    const DenseTensorMeta& x_meta,
+    const DenseTensorMeta& y_meta,
+    const DenseTensorMeta& out_grad_meta,
+    bool transpose_x,
+    bool transpose_y);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc
index a0006f49a2b383ccbab7b27319f2cf2fcc51be78..edb8f59e2677199dde7ca1a0ae7fed76e655e81f 100644
--- a/paddle/pten/kernels/cpu/cast_kernel.cc
+++ b/paddle/pten/kernels/cpu/cast_kernel.cc
@@ -36,7 +36,7 @@ void CastKernelImpl(const CPUContext& dev_ctx,
   auto numel = x.numel();
   auto* in_end = in_begin + numel;
 
-  auto* out_begin = out->mutable_data<OutT>();
+  auto* out_begin = out->mutable_data<OutT>(dev_ctx.GetPlace());
 
   paddle::platform::Transform<CPUContext> trans;
   trans(dev_ctx,
diff --git a/paddle/pten/kernels/cpu/copy_kernel.cc b/paddle/pten/kernels/cpu/copy_kernel.cc
index 1889838e253c93d986a22c746647746013e05ba6..be5170f4d05aab459df45fc6c36e0f34511c22b0 100644
--- a/paddle/pten/kernels/cpu/copy_kernel.cc
+++ b/paddle/pten/kernels/cpu/copy_kernel.cc
@@ -32,17 +32,16 @@ void Copy(const Context& dev_ctx,
           DenseTensor* dst) {
   auto* src_ptr = src.data();
   const auto& src_place = src.place();
-  const auto& dst_place = dst->place();
 
   VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
-          << dst_place;
+          << src_place;
 
-  dst->ResizeAndAllocate(src.dims());
-  auto* dst_ptr = dst->mutable_data();
+  dst->Resize(src.dims());
+  auto* dst_ptr = dst->mutable_data(src_place);
 
-  if (src_ptr == dst_ptr && src_place == dst_place) {
+  if (src_ptr == dst_ptr) {
     VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-            << dst_place;
+            << src_place;
     return;
   }
   VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
@@ -51,9 +50,8 @@ void Copy(const Context& dev_ctx,
   auto size = src.numel() *
               paddle::framework::SizeOfType(TransToProtoVarType(src.dtype()));
 
-  if (paddle::platform::is_cpu_place(src_place) &&
-      paddle::platform::is_cpu_place(dst_place)) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+  if (paddle::platform::is_cpu_place(src_place)) {
+    paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
   }
 }
 
diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc
index 5745737bbaecc5ee2a56c66752cc0e37f8487c1e..e6ffd3b5000b3f8152d6d2f9840b5379408022e7 100644
--- a/paddle/pten/kernels/cpu/dot_kernel.cc
+++ b/paddle/pten/kernels/cpu/dot_kernel.cc
@@ -29,7 +29,7 @@ void DotKernel(const Context& dev_ctx,
                DenseTensor* out) {
   auto const *x_ptr = x.data<T>(), *x_ptr_ = &x_ptr[0];
   auto const *y_ptr = y.data<T>(), *y_ptr_ = &y_ptr[0];
-  auto* z = out->mutable_data<T>();
+  auto* z = out->mutable_data<T>(dev_ctx.GetPlace());
 
   // Loop over the total N elements of both operands while sum-reducing every
   // B pairs along the way where B is the dimension of the least ordered axis
diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h
index e4f426d3f8eb4895ccaf209fd7626e2f083261c5..6bfde977ce51789d1d62736338f1098a8d4783a7 100644
--- a/paddle/pten/kernels/cpu/elementwise.h
+++ b/paddle/pten/kernels/cpu/elementwise.h
@@ -45,7 +45,10 @@ struct SameDimsAddFunctor<
                   const DenseTensor& y,
                   DenseTensor* z) {
     auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
-    blas.VADD(x.numel(), x.data<T>(), y.data<T>(), z->mutable_data<T>());
+    blas.VADD(x.numel(),
+              x.data<T>(),
+              y.data<T>(),
+              z->mutable_data<T>(dev_ctx.GetPlace()));
   }
 };
 
@@ -58,7 +61,7 @@ struct SameDimsAddFunctor<
                   const DenseTensor& x,
                   const DenseTensor& y,
                   DenseTensor* z) {
-    z->mutable_data<T>();
+    z->mutable_data<T>(dev_ctx.GetPlace());
     auto eigen_x = pten::EigenVector<T>::Flatten(x);
     auto eigen_y = pten::EigenVector<T>::Flatten(y);
     auto eigen_z = pten::EigenVector<T>::Flatten(*z);
@@ -86,7 +89,10 @@ struct SameDimsSubtractFunctor<
                   const DenseTensor& y,
                   DenseTensor* z) {
     auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
-    blas.VSUB(x.numel(), x.data<T>(), y.data<T>(), z->mutable_data<T>());
+    blas.VSUB(x.numel(),
+              x.data<T>(),
+              y.data<T>(),
+              z->mutable_data<T>(dev_ctx.GetPlace()));
   }
 };
 
@@ -141,7 +147,10 @@ struct SameDimsDivideFunctor<
                   const DenseTensor& y,
                   DenseTensor* z) {
     auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
-    blas.VDIV(x.numel(), x.data<T>(), y.data<T>(), z->mutable_data<T>());
+    blas.VDIV(x.numel(),
+              x.data<T>(),
+              y.data<T>(),
+              z->mutable_data<T>(dev_ctx.GetPlace()));
   }
 };
 
@@ -164,7 +173,10 @@ struct SameDimsMultiplyFunctor<
                   const DenseTensor& y,
                   DenseTensor* z) {
     auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
-    blas.VMUL(x.numel(), x.data<T>(), y.data<T>(), z->mutable_data<T>());
+    blas.VMUL(x.numel(),
+              x.data<T>(),
+              y.data<T>(),
+              z->mutable_data<T>(dev_ctx.GetPlace()));
   }
 };
 
@@ -280,7 +292,7 @@ void CommonForwardBroadcastCPU(const DenseTensor& x,
   PADDLE_ENFORCE_NOT_NULL(y_data,
                           paddle::platform::errors::InvalidArgument(
                               "The input Y should not be empty."));
-  OutType* out_data = z->mutable_data<OutType>();
+  OutType* out_data = z->mutable_data<OutType>(ctx.GetPlace());
 
   const int out_size = std::accumulate(
       out_dims_array, out_dims_array + max_dim, 1, std::multiplies<int>());
@@ -361,7 +373,7 @@ void ElementwiseCompute(const CPUContext& dev_ctx,
                         int axis,
                         Functor func,
                         DenseTensor* z) {
-  z->mutable_data<OutType>();
+  z->mutable_data<OutType>(dev_ctx.GetPlace());
   auto x_dims = x.dims();
   auto y_dims = y.dims();
   bool is_xsize_larger = true;
diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc
index 706a40936a3935499161449555fb58c84a45b02a..6d76626605c5c7bd3ea39470e824c57cb2a6484d 100644
--- a/paddle/pten/kernels/cpu/math_kernel.cc
+++ b/paddle/pten/kernels/cpu/math_kernel.cc
@@ -37,7 +37,7 @@ namespace pten {
                        const DenseTensor& y,                                \
                        int axis,                                            \
                        DenseTensor* out) {                                  \
-    out->mutable_data<T>();                                                 \
+    out->mutable_data<T>(dev_ctx.GetPlace());                               \
     if (x.dims() == y.dims()) {                                             \
       SameDimsElementwiseCompute<SameDims##name##Functor<CPUContext, T>>()( \
           dev_ctx, x, y, out);                                              \
@@ -85,7 +85,7 @@ void DivideRawKernel(const Context& dev_ctx,
                      int axis,
                      DenseTensor* out) {
   // allocate memory for out
-  out->mutable_data<T>();
+  out->mutable_data<T>(dev_ctx.GetPlace());
   if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
     SameDimsElementwiseCompute<SameDimsDivideFunctor<CPUContext, T>>()(
         dev_ctx, x, y, out);
diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h
index 86443c254bf67388d2613fa1078266edb81319a0..8f84bd0515b516e25821f8fa84d6935aa6260032 100644
--- a/paddle/pten/kernels/cpu/reduce.h
+++ b/paddle/pten/kernels/cpu/reduce.h
@@ -119,7 +119,7 @@ void GetShuffledInput(const DeviceContext& dev_ctx,
   GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis);
 
   shuffled_input->ResizeAndAllocate(shuffled_dims);
-  shuffled_input->mutable_data<OutT>();
+  shuffled_input->mutable_data<OutT>(dev_ctx.GetPlace());
 
   pten::math::TransposeNormal<DeviceContext, OutT> trans;
   trans(dev_ctx, input, shuffled_input, perm_axis);
@@ -158,7 +158,7 @@ void ReduceKernelImpl(const DeviceContext& dev_ctx,
                       const std::vector<int64_t>& dims,
                       bool keep_dim,
                       bool reduce_all) {
-  output->mutable_data<OutT>();
+  output->mutable_data<OutT>(dev_ctx.GetPlace());
 
   if (reduce_all) {
     // Flatten and reduce 1-D tensor
diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc
index 52949b589696c79a36535699a162c8957eda56f9..774d3891b03726a940b6f31a4058e37f3c79277d 100644
--- a/paddle/pten/kernels/cpu/scale_kernel.cc
+++ b/paddle/pten/kernels/cpu/scale_kernel.cc
@@ -33,7 +33,7 @@ void ScaleKernel(const Context& dev_ctx,
                  bool bias_after_scale,
                  DenseTensor* out) {
   // calc
-  out->mutable_data<T>();
+  out->mutable_data<T>(dev_ctx.GetPlace());
   auto eigen_out = pten::EigenVector<T>::Flatten(*out);
   auto eigen_x = pten::EigenVector<T>::Flatten(x);
   auto& dev = *dev_ctx.eigen_device();
diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc
index 2deac0146c52c267e65c9b9587cae1050874b66c..6ce4998287956c29140c8c3690661b2e92a6f450 100644
--- a/paddle/pten/kernels/empty_kernel.cc
+++ b/paddle/pten/kernels/empty_kernel.cc
@@ -29,7 +29,7 @@ void EmptyKernel(const Context& dev_ctx,
 
 template <typename T, typename Context>
 void EmptyLikeKernel(const Context& dev_ctx, DenseTensor* out) {
-  out->mutable_data<T>();
+  out->mutable_data<T>(dev_ctx.GetPlace());
 }
 
 }  // namespace pten
diff --git a/paddle/pten/kernels/funcs/eigen/CMakeLists.txt b/paddle/pten/kernels/funcs/eigen/CMakeLists.txt
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8b64e35b93526eb7edbe7f723832126ef7f0e0a6 100644
--- a/paddle/pten/kernels/funcs/eigen/CMakeLists.txt
+++ b/paddle/pten/kernels/funcs/eigen/CMakeLists.txt
@@ -0,0 +1,9 @@
+file(GLOB EIGEN_CC_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
+file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu")
+if(WITH_GPU)  # both the .cc and the .cu globs are compiled
+  nv_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3)
+elseif(WITH_ROCM)  # same source lists, built through hip_library
+  hip_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3)
+else()  # neither option set: only the .cc glob is compiled
+  cc_library(eigen_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3)
+endif()
diff --git a/paddle/fluid/operators/eigen/broadcast.cc b/paddle/pten/kernels/funcs/eigen/broadcast.cc
similarity index 77%
rename from paddle/fluid/operators/eigen/broadcast.cc
rename to paddle/pten/kernels/funcs/eigen/broadcast.cc
index dab25f95493726a1b9a459ea0f6f3f33ad7bb22e..8d6c6c1e82bf0a1ac5ccc9ca3aa4d2328002df0b 100644
--- a/paddle/fluid/operators/eigen/broadcast.cc
+++ b/paddle/pten/kernels/funcs/eigen/broadcast.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/common/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T, int Rank>
 struct EigenBroadcast<Eigen::DefaultDevice, T, Rank> {
@@ -31,13 +31,17 @@ struct EigenBroadcast<Eigen::DefaultDevice, T, Rank> {
       Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
                        Eigen::Aligned>;
 
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out, InType in,
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
+                   InType in,
                    const Array& bcast) {
     out.device(dev) = in.broadcast(bcast);
   }
 
-  static void Eval(const Eigen::DefaultDevice& dev, OutType32BitIndex out,
-                   InType32BitIndex in, const Array& bcast) {
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType32BitIndex out,
+                   InType32BitIndex in,
+                   const Array& bcast) {
     out.device(dev) = in.broadcast(bcast);
   }
 };
@@ -50,8 +54,11 @@ struct EigenBroadcastGrad<Eigen::DefaultDevice, T, Rank> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out, InType in,
-                   const Array& reduce_dims, const Array2& reshape_dims) {
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
+                   InType in,
+                   const Array& reduce_dims,
+                   const Array2& reshape_dims) {
     out.device(dev) =
         in.reshape(reshape_dims).sum(reduce_dims).reshape(out.dimensions());
   }
@@ -65,14 +72,14 @@ struct EigenBroadcastGrad<Eigen::DefaultDevice, T, Rank> {
   template struct FUNCTOR<Eigen::DefaultDevice, T, 5>; \
   template struct FUNCTOR<Eigen::DefaultDevice, T, 6>
 INSTANTIATION(EigenBroadcast, bool);
-INSTANTIATION(EigenBroadcast, platform::float16);
+INSTANTIATION(EigenBroadcast, dtype::float16);
 INSTANTIATION(EigenBroadcast, float);
 INSTANTIATION(EigenBroadcast, double);
 INSTANTIATION(EigenBroadcast, int);
 INSTANTIATION(EigenBroadcast, int64_t);
 INSTANTIATION(EigenBroadcastGrad, bool);
 INSTANTIATION(EigenBroadcastGrad, float);
-INSTANTIATION(EigenBroadcastGrad, platform::float16);
+INSTANTIATION(EigenBroadcastGrad, dtype::float16);
 INSTANTIATION(EigenBroadcastGrad, double);
 INSTANTIATION(EigenBroadcastGrad, int);
 INSTANTIATION(EigenBroadcastGrad, int64_t);
@@ -82,5 +89,5 @@ template struct EigenBroadcastGrad<Eigen::DefaultDevice, int, 0>;
 template struct EigenBroadcastGrad<Eigen::DefaultDevice, int64_t, 0>;
 #undef INSTANTIATION
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/broadcast.cu b/paddle/pten/kernels/funcs/eigen/broadcast.cu
similarity index 76%
rename from paddle/fluid/operators/eigen/broadcast.cu
rename to paddle/pten/kernels/funcs/eigen/broadcast.cu
index 63e244d393a9bcbbc4537fadd7c8bc996643e43f..5a9c26358a3e51a0ea9c138fa51a9db41c5818ea 100644
--- a/paddle/fluid/operators/eigen/broadcast.cu
+++ b/paddle/pten/kernels/funcs/eigen/broadcast.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/common/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T, int Rank>
 struct EigenBroadcast<Eigen::GpuDevice, T, Rank> {
@@ -31,13 +31,17 @@ struct EigenBroadcast<Eigen::GpuDevice, T, Rank> {
       Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
                        Eigen::Aligned>;
 
-  static void Eval(const Eigen::GpuDevice& dev, OutType out, InType in,
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType out,
+                   InType in,
                    const Array& bcast) {
     out.device(dev) = in.broadcast(bcast);
   }
 
-  static void Eval(const Eigen::GpuDevice& dev, OutType32BitIndex out,
-                   InType32BitIndex in, const Array& bcast) {
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType32BitIndex out,
+                   InType32BitIndex in,
+                   const Array& bcast) {
     out.device(dev) = in.broadcast(bcast);
   }
 };
@@ -50,8 +54,11 @@ struct EigenBroadcastGrad<Eigen::GpuDevice, T, Rank> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType out, InType in,
-                   const Array& reduce_dims, const Array2& reshape_dims) {
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType out,
+                   InType in,
+                   const Array& reduce_dims,
+                   const Array2& reshape_dims) {
     out.device(dev) =
         in.reshape(reshape_dims).sum(reduce_dims).reshape(out.dimensions());
   }
@@ -65,23 +72,23 @@ struct EigenBroadcastGrad<Eigen::GpuDevice, T, Rank> {
   template struct FUNCTOR<Eigen::GpuDevice, T, 5>; \
   template struct FUNCTOR<Eigen::GpuDevice, T, 6>
 INSTANTIATION(EigenBroadcast, bool);
-INSTANTIATION(EigenBroadcast, platform::float16);
+INSTANTIATION(EigenBroadcast, dtype::float16);
 INSTANTIATION(EigenBroadcast, float);
 INSTANTIATION(EigenBroadcast, double);
 INSTANTIATION(EigenBroadcast, int);
 INSTANTIATION(EigenBroadcast, int64_t);
 INSTANTIATION(EigenBroadcastGrad, bool);
 INSTANTIATION(EigenBroadcastGrad, float);
-INSTANTIATION(EigenBroadcastGrad, platform::float16);
+INSTANTIATION(EigenBroadcastGrad, dtype::float16);
 INSTANTIATION(EigenBroadcastGrad, double);
 INSTANTIATION(EigenBroadcastGrad, int);
 INSTANTIATION(EigenBroadcastGrad, int64_t);
 template struct EigenBroadcastGrad<Eigen::GpuDevice, float, 0>;
-template struct EigenBroadcastGrad<Eigen::GpuDevice, platform::float16, 0>;
+template struct EigenBroadcastGrad<Eigen::GpuDevice, dtype::float16, 0>;
 template struct EigenBroadcastGrad<Eigen::GpuDevice, double, 0>;
 template struct EigenBroadcastGrad<Eigen::GpuDevice, int, 0>;
 template struct EigenBroadcastGrad<Eigen::GpuDevice, int64_t, 0>;
 #undef INSTANTIATION
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/constant.cc b/paddle/pten/kernels/funcs/eigen/constant.cc
similarity index 86%
rename from paddle/fluid/operators/eigen/constant.cc
rename to paddle/pten/kernels/funcs/eigen/constant.cc
index 45b03ccbf10043ad142c7de15d7cdf110e134f9a..5eb25f9bb9a858e585a54cd192d0ab41d8488bd9 100644
--- a/paddle/fluid/operators/eigen/constant.cc
+++ b/paddle/pten/kernels/funcs/eigen/constant.cc
@@ -11,10 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T, int Rank>
 struct EigenConstant<Eigen::DefaultDevice, T, Rank> {
@@ -27,5 +27,5 @@ struct EigenConstant<Eigen::DefaultDevice, T, Rank> {
 
 template struct EigenConstant<Eigen::DefaultDevice, float, 1>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/constant.cu b/paddle/pten/kernels/funcs/eigen/constant.cu
similarity index 80%
rename from paddle/fluid/operators/eigen/constant.cu
rename to paddle/pten/kernels/funcs/eigen/constant.cu
index cf4a2917f7d36f817b53aa892ff1b43b347086c8..c33da70e6a0daf88aa26bc84e85b0ec93be4ae8a 100644
--- a/paddle/fluid/operators/eigen/constant.cu
+++ b/paddle/pten/kernels/funcs/eigen/constant.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,10 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T, int Rank>
 struct EigenConstant<Eigen::GpuDevice, T, Rank> {
@@ -27,5 +27,5 @@ struct EigenConstant<Eigen::GpuDevice, T, Rank> {
 
 template struct EigenConstant<Eigen::GpuDevice, float, 1>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/pten/kernels/funcs/eigen/eigen_function.h b/paddle/pten/kernels/funcs/eigen/eigen_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..75ba543dd9ce5d816c44f8ccbc8583f6f46b6313
--- /dev/null
+++ b/paddle/pten/kernels/funcs/eigen/eigen_function.h
@@ -0,0 +1,319 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace pten {
+namespace funcs {
+
+template <typename EigenDevice, typename T, int Rank>
+struct EigenBroadcast {
+  using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using InType32BitIndex =
+      Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
+                       Eigen::Aligned>;
+  using OutType = Eigen::TensorMap<
+      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType32BitIndex =
+      Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
+                       Eigen::Aligned>;
+  static void Eval(const EigenDevice& dev,
+                   OutType out,
+                   InType in,
+                   const Array& bcast);
+  static void Eval(const EigenDevice& dev,
+                   OutType32BitIndex out,
+                   InType32BitIndex in,
+                   const Array& bcast);
+};
+
+template <typename EigenDevice, typename T, int Rank>
+struct EigenBroadcastGrad {
+  using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
+  using Array2 = Eigen::DSizes<Eigen::DenseIndex, Rank * 2>;
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType out,
+                   InType in,
+                   const Array& reduce_dims,
+                   const Array2& reshape_dims);
+};
+
+template <typename EigenDevice, typename T, int Rank>
+struct EigenConstant {
+  using Type = Eigen::TensorMap<
+      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev, Type out, const T value);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenSign {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev, OutType out, const InType& in);
+};
+
+template <typename EigenDevice, typename T, int Rank>
+struct EigenReverse {
+  using Array = Eigen::DSizes<bool, Rank>;
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType = Eigen::TensorMap<
+      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const Array& reverse);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenAdd {
+  using InType = Eigen::TensorMap<Eigen::TensorFixedSize<const T,
+                                                         Eigen::Sizes<>,
+                                                         Eigen::RowMajor,
+                                                         Eigen::DenseIndex>>;
+  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<T,
+                                                          Eigen::Sizes<>,
+                                                          Eigen::RowMajor,
+                                                          Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const T value);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenSub {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType out,
+                   const InType& left,
+                   const InType& right);
+};
+
+template <typename EigenDevice, typename T, int Rank>
+struct EigenSlice {
+  using Array = Eigen::DSizes<Eigen::DenseIndex, Rank>;
+  using Array32Bit = Eigen::DSizes<int, Rank>;
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using InType32BitIndex =
+      Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
+                       Eigen::Aligned>;
+  using OutType = Eigen::TensorMap<
+      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType32BitIndex =
+      Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
+                       Eigen::Aligned>;
+  static void Eval(const EigenDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const Array& offsets,
+                   const Array& extents);
+  static void Eval(const EigenDevice& dev,
+                   OutType32BitIndex out,
+                   const InType32BitIndex& in,
+                   const Array32Bit& offsets,
+                   const Array32Bit& extents);
+};
+
+template <typename EigenDevice, typename T, int Rank>
+struct EigenPad {
+  using Array = std::array<std::pair<int64_t, int64_t>, Rank>;
+  using Array32Bit = std::array<std::pair<int, int>, Rank>;
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using InType32BitIndex =
+      Eigen::TensorMap<Eigen::Tensor<const T, Rank, Eigen::RowMajor, int>,
+                       Eigen::Aligned>;
+  using OutType = Eigen::TensorMap<
+      Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType32BitIndex =
+      Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
+                       Eigen::Aligned>;
+  static void Eval(const EigenDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const Array& padding,
+                   const T value);
+  static void Eval(const EigenDevice& dev,
+                   OutType32BitIndex out,
+                   const InType32BitIndex& in,
+                   const Array32Bit& padding,
+                   const T value);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenScale {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const T scale,
+                   const T bias,
+                   const bool bias_after_scale);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenErf {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev, OutType out, const InType& in);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenErfGrad {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType din,
+                   const InType& in,
+                   const InType& dout);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenRankLoss {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType out,
+                   const InType& label,
+                   const InType& left,
+                   const InType& right);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenRankLossGrad {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void EvalLeft(const EigenDevice& dev,
+                       OutType dleft,
+                       const InType& dout,
+                       const InType& label,
+                       const InType& left,
+                       const InType& right);
+  static void EvalRight(const EigenDevice& dev,
+                        OutType dright,
+                        const InType& dout,
+                        const InType& label,
+                        const InType& left,
+                        const InType& right);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenLogLoss {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType out,
+                   const InType& pred,
+                   const InType& label,
+                   const T& epsilon);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenLogLossGrad {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType dpred,
+                   const InType& dloss,
+                   const InType& pred,
+                   const InType& label,
+                   const T& epsilon);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenHingeLoss {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType loss,
+                   const InType& pred,
+                   const InType& label);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenHingeLossGrad {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType dpred,
+                   const InType& dloss,
+                   const InType& pred,
+                   const InType& label);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenL1Norm {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<T,
+                                                          Eigen::Sizes<>,
+                                                          Eigen::RowMajor,
+                                                          Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev, OutType out, const InType& in);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenL1NormGrad {
+  using Array = Eigen::DSizes<Eigen::DenseIndex, 1>;
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev,
+                   OutType din,
+                   const InType& dout,
+                   const InType& in,
+                   const Array& bcast);
+};
+
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/elementwise.cc b/paddle/pten/kernels/funcs/eigen/elementwise.cc
similarity index 52%
rename from paddle/fluid/operators/eigen/elementwise.cc
rename to paddle/pten/kernels/funcs/eigen/elementwise.cc
index bedecfe5c224feda5126050be1f80843db5b0a87..700bd1363c95d452e2251780e1cabe54adfe6ba8 100644
--- a/paddle/fluid/operators/eigen/elementwise.cc
+++ b/paddle/pten/kernels/funcs/eigen/elementwise.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,19 +11,25 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenAdd<Eigen::DefaultDevice, T> {
-  using InType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      const T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
-                   const InType& in, const T value) {
+  using InType = Eigen::TensorMap<Eigen::TensorFixedSize<const T,
+                                                         Eigen::Sizes<>,
+                                                         Eigen::RowMajor,
+                                                         Eigen::DenseIndex>>;
+  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<T,
+                                                          Eigen::Sizes<>,
+                                                          Eigen::RowMajor,
+                                                          Eigen::DenseIndex>>;
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const T value) {
     out.device(dev) = in + value;
   }
 };
@@ -39,13 +45,15 @@ struct EigenSub<Eigen::DefaultDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
-                   const InType& left, const InType& right) {
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
+                   const InType& left,
+                   const InType& right) {
     out.device(dev) = left - right;
   }
 };
 
 template struct EigenSub<Eigen::DefaultDevice, float>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/elementwise.cu b/paddle/pten/kernels/funcs/eigen/elementwise.cu
similarity index 55%
rename from paddle/fluid/operators/eigen/elementwise.cu
rename to paddle/pten/kernels/funcs/eigen/elementwise.cu
index a750a06284f5e44fa71440820e2c40c0868f4e6f..999046b74a99d4fc84c5afaffef123bf7edf4abd 100644
--- a/paddle/fluid/operators/eigen/elementwise.cu
+++ b/paddle/pten/kernels/funcs/eigen/elementwise.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,18 +11,24 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenAdd<Eigen::GpuDevice, T> {
-  using InType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      const T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in,
+  using InType = Eigen::TensorMap<Eigen::TensorFixedSize<const T,
+                                                         Eigen::Sizes<>,
+                                                         Eigen::RowMajor,
+                                                         Eigen::DenseIndex>>;
+  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<T,
+                                                          Eigen::Sizes<>,
+                                                          Eigen::RowMajor,
+                                                          Eigen::DenseIndex>>;
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType out,
+                   const InType& in,
                    const T value) {
     out.device(dev) = in + value;
   }
@@ -39,7 +45,9 @@ struct EigenSub<Eigen::GpuDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& left,
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType out,
+                   const InType& left,
                    const InType& right) {
     out.device(dev) = left - right;
   }
@@ -47,5 +55,5 @@ struct EigenSub<Eigen::GpuDevice, T> {
 
 template struct EigenSub<Eigen::GpuDevice, float>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/erf.cc b/paddle/pten/kernels/funcs/eigen/erf.cc
similarity index 71%
rename from paddle/fluid/operators/eigen/erf.cc
rename to paddle/pten/kernels/funcs/eigen/erf.cc
index 6c2c734c97769418fa9316150c606909acf33eba..1ebb9f84846d38fcf1d268c5c5609d6b9e3a6062 100644
--- a/paddle/fluid/operators/eigen/erf.cc
+++ b/paddle/pten/kernels/funcs/eigen/erf.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,12 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/platform/eigen_ext.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/common/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenErf<Eigen::DefaultDevice, T> {
@@ -24,7 +24,8 @@ struct EigenErf<Eigen::DefaultDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
                    const InType& in) {
     out.device(dev) = in.erf();
   }
@@ -36,8 +37,10 @@ struct EigenErfGrad<Eigen::DefaultDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType din,
-                   const InType& in, const InType& dout) {
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType din,
+                   const InType& in,
+                   const InType& dout) {
     din.device(dev) =
         dout * static_cast<T>(M_2_SQRTPI) * (-(in.square())).exp();
   }
@@ -46,10 +49,10 @@ struct EigenErfGrad<Eigen::DefaultDevice, T> {
 #define INSTANTIATION(FUNCTOR)                           \
   template struct FUNCTOR<Eigen::DefaultDevice, float>;  \
   template struct FUNCTOR<Eigen::DefaultDevice, double>; \
-  template struct FUNCTOR<Eigen::DefaultDevice, platform::float16>
+  template struct FUNCTOR<Eigen::DefaultDevice, dtype::float16>
 INSTANTIATION(EigenErf);
 INSTANTIATION(EigenErfGrad);
 #undef INSTANTIATION
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/erf.cu b/paddle/pten/kernels/funcs/eigen/erf.cu
similarity index 77%
rename from paddle/fluid/operators/eigen/erf.cu
rename to paddle/pten/kernels/funcs/eigen/erf.cu
index 632205bdcbf7efaf6004e071ea078739742a417f..1971f5abbce2de16d36ff15c33bd438fe8eee78b 100644
--- a/paddle/fluid/operators/eigen/erf.cu
+++ b/paddle/pten/kernels/funcs/eigen/erf.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,12 +14,12 @@ limitations under the License. */
 #ifndef _USE_MATH_DEFINES
 #define _USE_MATH_DEFINES
 #endif
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/platform/eigen_ext.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/common/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenErf<Eigen::GpuDevice, T> {
@@ -38,7 +38,9 @@ struct EigenErfGrad<Eigen::GpuDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType din, const InType& in,
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType din,
+                   const InType& in,
                    const InType& dout) {
     din.device(dev) =
         dout * static_cast<T>(M_2_SQRTPI) * (-(in.square())).exp();
@@ -48,10 +50,10 @@ struct EigenErfGrad<Eigen::GpuDevice, T> {
 #define INSTANTIATION(FUNCTOR)                       \
   template struct FUNCTOR<Eigen::GpuDevice, float>;  \
   template struct FUNCTOR<Eigen::GpuDevice, double>; \
-  template struct FUNCTOR<Eigen::GpuDevice, platform::float16>
+  template struct FUNCTOR<Eigen::GpuDevice, dtype::float16>
 INSTANTIATION(EigenErf);
 INSTANTIATION(EigenErfGrad);
 #undef INSTANTIATION
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/pten/kernels/funcs/eigen/extensions.h
similarity index 63%
rename from paddle/fluid/platform/eigen_ext.h
rename to paddle/pten/kernels/funcs/eigen/extensions.h
index 872a6cf062eeff01e2725e8d8ea64058438be114..a67b3268f09469da0d68f2633f32d9ac97bfee9a 100644
--- a/paddle/fluid/platform/eigen_ext.h
+++ b/paddle/pten/kernels/funcs/eigen/extensions.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,25 +14,25 @@
 
 #pragma once
 
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/common/complex.h"
+#include "paddle/pten/common/float16.h"
 #include "paddle/pten/core/hostdevice.h"
 
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace Eigen {
 
-using float16 = paddle::platform::float16;
+using float16 = pten::dtype::float16;
 template <typename T>
-using complex = paddle::platform::complex<T>;
+using complex = pten::dtype::complex<T>;
 
 template <typename T>
 struct NumTraits;
 
 template <>
-struct NumTraits<paddle::platform::bfloat16>
-    : GenericNumTraits<paddle::platform::bfloat16> {
+struct NumTraits<pten::dtype::bfloat16>
+    : GenericNumTraits<pten::dtype::bfloat16> {
   enum {
     IsSigned = true,
     IsInteger = false,
@@ -40,23 +40,23 @@ struct NumTraits<paddle::platform::bfloat16>
     RequireInitialization = false
   };
 
-  HOSTDEVICE static inline paddle::platform::bfloat16 epsilon() {
-    return paddle::platform::raw_uint16_to_bfloat16(0x3400);
+  HOSTDEVICE static inline pten::dtype::bfloat16 epsilon() {
+    return pten::dtype::raw_uint16_to_bfloat16(0x3400);
   }
-  HOSTDEVICE static inline paddle::platform::bfloat16 dummy_precision() {
-    return paddle::platform::bfloat16(1e-5f);
+  HOSTDEVICE static inline pten::dtype::bfloat16 dummy_precision() {
+    return pten::dtype::bfloat16(1e-5f);
   }
-  HOSTDEVICE static inline paddle::platform::bfloat16 highest() {
-    return paddle::platform::raw_uint16_to_bfloat16(0x7f7f);
+  HOSTDEVICE static inline pten::dtype::bfloat16 highest() {
+    return pten::dtype::raw_uint16_to_bfloat16(0x7f7f);
   }
-  HOSTDEVICE static inline paddle::platform::bfloat16 lowest() {
-    return paddle::platform::raw_uint16_to_bfloat16(0xff7f);
+  HOSTDEVICE static inline pten::dtype::bfloat16 lowest() {
+    return pten::dtype::raw_uint16_to_bfloat16(0xff7f);
   }
-  HOSTDEVICE static inline paddle::platform::bfloat16 infinity() {
-    return paddle::platform::raw_uint16_to_bfloat16(0x7f80);
+  HOSTDEVICE static inline pten::dtype::bfloat16 infinity() {
+    return pten::dtype::raw_uint16_to_bfloat16(0x7f80);
   }
-  HOSTDEVICE static inline paddle::platform::bfloat16 quiet_NaN() {
-    return paddle::platform::raw_uint16_to_bfloat16(0xffc1);
+  HOSTDEVICE static inline pten::dtype::bfloat16 quiet_NaN() {
+    return pten::dtype::raw_uint16_to_bfloat16(0xffc1);
   }
 };
 
@@ -114,20 +114,20 @@ struct NumTraits<float16> : GenericNumTraits<float16> {
   };
 
   HOSTDEVICE static inline float16 epsilon() {
-    return paddle::platform::raw_uint16_to_float16(0x0800);
+    return pten::dtype::raw_uint16_to_float16(0x0800);
   }
   HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); }
   HOSTDEVICE static inline float16 highest() {
-    return paddle::platform::raw_uint16_to_float16(0x7bff);
+    return pten::dtype::raw_uint16_to_float16(0x7bff);
   }
   HOSTDEVICE static inline float16 lowest() {
-    return paddle::platform::raw_uint16_to_float16(0xfbff);
+    return pten::dtype::raw_uint16_to_float16(0xfbff);
   }
   HOSTDEVICE static inline float16 infinity() {
-    return paddle::platform::raw_uint16_to_float16(0x7c00);
+    return pten::dtype::raw_uint16_to_float16(0x7c00);
   }
   HOSTDEVICE static inline float16 quiet_NaN() {
-    return paddle::platform::raw_uint16_to_float16(0x7c01);
+    return pten::dtype::raw_uint16_to_float16(0x7c01);
   }
 };
 
@@ -136,96 +136,86 @@ namespace numext {
 //////////// bfloat methods /////////////
 
 template <>
-HOSTDEVICE inline bool(isnan)(const paddle::platform::bfloat16& a) {
-  return (paddle::platform::isnan)(a);
+HOSTDEVICE inline bool(isnan)(const pten::dtype::bfloat16& a) {
+  return (pten::dtype::isnan)(a);
 }
 
 template <>
-HOSTDEVICE inline bool(isinf)(const paddle::platform::bfloat16& a) {
-  return (paddle::platform::isinf)(a);
+HOSTDEVICE inline bool(isinf)(const pten::dtype::bfloat16& a) {
+  return (pten::dtype::isinf)(a);
 }
 
 template <>
-HOSTDEVICE inline bool(isfinite)(const paddle::platform::bfloat16& a) {
-  return (paddle::platform::isfinite)(a);
+HOSTDEVICE inline bool(isfinite)(const pten::dtype::bfloat16& a) {
+  return (pten::dtype::isfinite)(a);
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 exp(
-    const paddle::platform::bfloat16& a) {
-  return paddle::platform::bfloat16(::expf(static_cast<float>(a)));
+HOSTDEVICE inline pten::dtype::bfloat16 exp(const pten::dtype::bfloat16& a) {
+  return pten::dtype::bfloat16(::expf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 expm1(
-    const paddle::platform::bfloat16& a) {
-  return paddle::platform::bfloat16(::expm1f(static_cast<float>(a)));
+HOSTDEVICE inline pten::dtype::bfloat16 expm1(const pten::dtype::bfloat16& a) {
+  return pten::dtype::bfloat16(::expm1f(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 erf(
-    const paddle::platform::bfloat16& a) {
-  return paddle::platform::bfloat16(::erff(static_cast<float>(a)));
+HOSTDEVICE inline pten::dtype::bfloat16 erf(const pten::dtype::bfloat16& a) {
+  return pten::dtype::bfloat16(::erff(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 log(
-    const paddle::platform::bfloat16& a) {
-  return paddle::platform::bfloat16(::logf(static_cast<float>(a)));
+HOSTDEVICE inline pten::dtype::bfloat16 log(const pten::dtype::bfloat16& a) {
+  return pten::dtype::bfloat16(::logf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 tanh(
-    const paddle::platform::bfloat16& a) {
-  return paddle::platform::bfloat16(::tanhf(static_cast<float>(a)));
+HOSTDEVICE inline pten::dtype::bfloat16 tanh(const pten::dtype::bfloat16& a) {
+  return pten::dtype::bfloat16(::tanhf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 sqrt(
-    const paddle::platform::bfloat16& a) {
-  return paddle::platform::bfloat16(::sqrtf(static_cast<float>(a)));
+HOSTDEVICE inline pten::dtype::bfloat16 sqrt(const pten::dtype::bfloat16& a) {
+  return pten::dtype::bfloat16(::sqrtf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 ceil(
-    const paddle::platform::bfloat16& a) {
-  return paddle::platform::bfloat16(::ceilf(static_cast<float>(a)));
+HOSTDEVICE inline pten::dtype::bfloat16 ceil(const pten::dtype::bfloat16& a) {
+  return pten::dtype::bfloat16(::ceilf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 floor(
-    const paddle::platform::bfloat16& a) {
-  return paddle::platform::bfloat16(::floorf(static_cast<float>(a)));
+HOSTDEVICE inline pten::dtype::bfloat16 floor(const pten::dtype::bfloat16& a) {
+  return pten::dtype::bfloat16(::floorf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 round(
-    const paddle::platform::bfloat16& a) {
-  return paddle::platform::bfloat16(::roundf(static_cast<float>(a)));
+HOSTDEVICE inline pten::dtype::bfloat16 round(const pten::dtype::bfloat16& a) {
+  return pten::dtype::bfloat16(::roundf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 pow(
-    const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
-  return paddle::platform::bfloat16(
+HOSTDEVICE inline pten::dtype::bfloat16 pow(const pten::dtype::bfloat16& a,
+                                            const pten::dtype::bfloat16& b) {
+  return pten::dtype::bfloat16(
       ::powf(static_cast<float>(a), static_cast<float>(b)));
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 abs(
-    const paddle::platform::bfloat16& a) {
-  return paddle::platform::bfloat16(::fabs(static_cast<float>(a)));
+HOSTDEVICE inline pten::dtype::bfloat16 abs(const pten::dtype::bfloat16& a) {
+  return pten::dtype::bfloat16(::fabs(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 mini(
-    const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
+HOSTDEVICE inline pten::dtype::bfloat16 mini(const pten::dtype::bfloat16& a,
+                                             const pten::dtype::bfloat16& b) {
   return b < a ? b : a;
 }
 
 template <>
-HOSTDEVICE inline paddle::platform::bfloat16 maxi(
-    const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
+HOSTDEVICE inline pten::dtype::bfloat16 maxi(const pten::dtype::bfloat16& a,
+                                             const pten::dtype::bfloat16& b) {
   return a < b ? b : a;
 }
 
@@ -233,17 +223,17 @@ HOSTDEVICE inline paddle::platform::bfloat16 maxi(
 
 template <>
 HOSTDEVICE inline bool(isnan)(const complex<float>& a) {
-  return (paddle::platform::isnan)(a);
+  return (pten::dtype::isnan)(a);
 }
 
 template <>
 HOSTDEVICE inline bool(isinf)(const complex<float>& a) {
-  return (paddle::platform::isinf)(a);
+  return (pten::dtype::isinf)(a);
 }
 
 template <>
 HOSTDEVICE inline bool(isfinite)(const complex<float>& a) {
-  return (paddle::platform::isfinite)(a);
+  return (pten::dtype::isfinite)(a);
 }
 
 template <>
@@ -256,17 +246,17 @@ HOSTDEVICE inline complex<float> exp(const complex<float>& a) {
 
 template <>
 HOSTDEVICE inline complex<float> log(const complex<float>& a) {
-  return paddle::platform::log(a);
+  return pten::dtype::log(a);
 }
 
 template <>
 HOSTDEVICE inline complex<float> tanh(const complex<float>& a) {
-  return paddle::platform::tanh(a);
+  return pten::dtype::tanh(a);
 }
 
 template <>
 HOSTDEVICE inline complex<float> sqrt(const complex<float>& a) {
-  return paddle::platform::sqrt(a);
+  return pten::dtype::sqrt(a);
 }
 
 template <>
@@ -287,29 +277,29 @@ HOSTDEVICE inline complex<float> round(const complex<float>& a) {
 template <>
 HOSTDEVICE inline complex<float> pow(const complex<float>& a,
                                      const complex<float>& b) {
-  return paddle::platform::pow(a, b);
+  return pten::dtype::pow(a, b);
 }
 
 template <>
 HOSTDEVICE inline float abs(const complex<float>& a) {
-  return paddle::platform::abs(a);
+  return pten::dtype::abs(a);
 }
 
 //////////// complex<double> methods /////////////
 
 template <>
 HOSTDEVICE inline bool(isnan)(const complex<double>& a) {
-  return (paddle::platform::isnan)(a);
+  return (pten::dtype::isnan)(a);
 }
 
 template <>
 HOSTDEVICE inline bool(isinf)(const complex<double>& a) {
-  return (paddle::platform::isinf)(a);
+  return (pten::dtype::isinf)(a);
 }
 
 template <>
 HOSTDEVICE inline bool(isfinite)(const complex<double>& a) {
-  return (paddle::platform::isfinite)(a);
+  return (pten::dtype::isfinite)(a);
 }
 
 template <>
@@ -322,17 +312,17 @@ HOSTDEVICE inline complex<double> exp(const complex<double>& a) {
 
 template <>
 HOSTDEVICE inline complex<double> log(const complex<double>& a) {
-  return paddle::platform::log(a);
+  return pten::dtype::log(a);
 }
 
 template <>
 HOSTDEVICE inline complex<double> tanh(const complex<double>& a) {
-  return paddle::platform::tanh(a);
+  return pten::dtype::tanh(a);
 }
 
 template <>
 HOSTDEVICE inline complex<double> sqrt(const complex<double>& a) {
-  return paddle::platform::sqrt(a);
+  return pten::dtype::sqrt(a);
 }
 
 template <>
@@ -353,29 +343,29 @@ HOSTDEVICE inline complex<double> round(const complex<double>& a) {
 template <>
 HOSTDEVICE inline complex<double> pow(const complex<double>& a,
                                       const complex<double>& b) {
-  return paddle::platform::pow(a, b);
+  return pten::dtype::pow(a, b);
 }
 
 template <>
 HOSTDEVICE inline double abs(const complex<double>& a) {
-  return paddle::platform::abs(a);
+  return pten::dtype::abs(a);
 }
 
 //////////// float16 methods /////////////
 
 template <>
 HOSTDEVICE inline bool(isnan)(const float16& a) {
-  return (paddle::platform::isnan)(a);
+  return (pten::dtype::isnan)(a);
 }
 
 template <>
 HOSTDEVICE inline bool(isinf)(const float16& a) {
-  return (paddle::platform::isinf)(a);
+  return (pten::dtype::isinf)(a);
 }
 
 template <>
 HOSTDEVICE inline bool(isfinite)(const float16& a) {
-  return (paddle::platform::isfinite)(a);
+  return (pten::dtype::isfinite)(a);
 }
 
 template <>
diff --git a/paddle/fluid/operators/eigen/l1_norm.cc b/paddle/pten/kernels/funcs/eigen/l1_norm.cc
similarity index 62%
rename from paddle/fluid/operators/eigen/l1_norm.cc
rename to paddle/pten/kernels/funcs/eigen/l1_norm.cc
index e7ed60f76662eb7907f4884d93149f6f49bc0bc8..37d06b7879ff0eede34139ee5050e435738e722b 100644
--- a/paddle/fluid/operators/eigen/l1_norm.cc
+++ b/paddle/pten/kernels/funcs/eigen/l1_norm.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,18 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenL1Norm<Eigen::DefaultDevice, T> {
   using InType = Eigen::TensorMap<
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
+  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<T,
+                                                          Eigen::Sizes<>,
+                                                          Eigen::RowMajor,
+                                                          Eigen::DenseIndex>>;
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
                    const InType& in) {
     out.device(dev) = in.abs().sum();
   }
@@ -35,8 +38,11 @@ struct EigenL1NormGrad<Eigen::DefaultDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType din,
-                   const InType& dout, const InType& in, const Array& bcast) {
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType din,
+                   const InType& dout,
+                   const InType& in,
+                   const Array& bcast) {
     din.device(dev) = dout.broadcast(bcast) * in.sign();
   }
 };
@@ -44,5 +50,5 @@ struct EigenL1NormGrad<Eigen::DefaultDevice, T> {
 template struct EigenL1Norm<Eigen::DefaultDevice, float>;
 template struct EigenL1NormGrad<Eigen::DefaultDevice, float>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/l1_norm.cu b/paddle/pten/kernels/funcs/eigen/l1_norm.cu
similarity index 65%
rename from paddle/fluid/operators/eigen/l1_norm.cu
rename to paddle/pten/kernels/funcs/eigen/l1_norm.cu
index a27cd7ae6b7898d8d7fe4001cdfd447d02e19cb7..00156597dc538d0c8f5bbcf2ac6844b1ef85361e 100644
--- a/paddle/fluid/operators/eigen/l1_norm.cu
+++ b/paddle/pten/kernels/funcs/eigen/l1_norm.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,17 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenL1Norm<Eigen::GpuDevice, T> {
   using InType = Eigen::TensorMap<
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType = Eigen::TensorMap<Eigen::TensorFixedSize<T,
+                                                          Eigen::Sizes<>,
+                                                          Eigen::RowMajor,
+                                                          Eigen::DenseIndex>>;
   static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in) {
     out.device(dev) = in.abs().sum();
   }
@@ -34,8 +36,11 @@ struct EigenL1NormGrad<Eigen::GpuDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType din, const InType& dout,
-                   const InType& in, const Array& bcast) {
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType din,
+                   const InType& dout,
+                   const InType& in,
+                   const Array& bcast) {
     din.device(dev) = dout.broadcast(bcast) * in.sign();
   }
 };
@@ -43,5 +48,5 @@ struct EigenL1NormGrad<Eigen::GpuDevice, T> {
 template struct EigenL1Norm<Eigen::GpuDevice, float>;
 template struct EigenL1NormGrad<Eigen::GpuDevice, float>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/loss.cc b/paddle/pten/kernels/funcs/eigen/loss.cc
similarity index 69%
rename from paddle/fluid/operators/eigen/loss.cc
rename to paddle/pten/kernels/funcs/eigen/loss.cc
index 469456537d9aa20564cf9abe2bf1ece735534be3..2a762fadd8110748692e30bc82f04f2fdbf9253d 100644
--- a/paddle/fluid/operators/eigen/loss.cc
+++ b/paddle/pten/kernels/funcs/eigen/loss.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,10 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenRankLoss<Eigen::DefaultDevice, T> {
@@ -22,8 +22,10 @@ struct EigenRankLoss<Eigen::DefaultDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
-                   const InType& label, const InType& left,
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
+                   const InType& label,
+                   const InType& left,
                    const InType& right) {
     out.device(dev) =
         (1.0f + (left - right).exp()).log() - label * (left - right);
@@ -37,15 +39,21 @@ struct EigenRankLossGrad<Eigen::DefaultDevice, T> {
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
 
-  static void EvalLeft(const Eigen::DefaultDevice& dev, OutType dleft,
-                       const InType& dout, const InType& label,
-                       const InType& left, const InType& right) {
+  static void EvalLeft(const Eigen::DefaultDevice& dev,
+                       OutType dleft,
+                       const InType& dout,
+                       const InType& label,
+                       const InType& left,
+                       const InType& right) {
     dleft.device(dev) = dout * (1.0f / (1.0f + (right - left).exp()) - label);
   }
 
-  static void EvalRight(const Eigen::DefaultDevice& dev, OutType dright,
-                        const InType& dout, const InType& label,
-                        const InType& left, const InType& right) {
+  static void EvalRight(const Eigen::DefaultDevice& dev,
+                        OutType dright,
+                        const InType& dout,
+                        const InType& label,
+                        const InType& left,
+                        const InType& right) {
     dright.device(dev) = -dout * (1.0f / (1.0f + (right - left).exp()) - label);
   }
 };
@@ -59,8 +67,11 @@ struct EigenLogLoss<Eigen::DefaultDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
-                   const InType& pred, const InType& label, const T& epsilon) {
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
+                   const InType& pred,
+                   const InType& label,
+                   const T& epsilon) {
     out.device(dev) = (-(label * (pred + epsilon).log()) -
                        ((static_cast<T>(1) - label) *
                         (static_cast<T>(1) - pred + epsilon).log()));
@@ -73,8 +84,11 @@ struct EigenLogLossGrad<Eigen::DefaultDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType dpred,
-                   const InType& dloss, const InType& pred, const InType& label,
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType dpred,
+                   const InType& dloss,
+                   const InType& pred,
+                   const InType& label,
                    const T& epsilon) {
     dpred.device(dev) =
         dloss *
@@ -92,8 +106,10 @@ struct EigenHingeLoss<Eigen::DefaultDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType loss,
-                   const InType& pred, const InType& label) {
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType loss,
+                   const InType& pred,
+                   const InType& label) {
     loss.device(dev) = (static_cast<T>(1) -
                         pred * (static_cast<T>(2) * label - static_cast<T>(1)))
                            .cwiseMax(static_cast<T>(0));
@@ -106,8 +122,10 @@ struct EigenHingeLossGrad<Eigen::DefaultDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType dpred,
-                   const InType& dloss, const InType& pred,
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType dpred,
+                   const InType& dloss,
+                   const InType& pred,
                    const InType& label) {
     auto alt_labels = static_cast<T>(2) * label - static_cast<T>(1);
     dpred.device(dev) =
@@ -119,5 +137,5 @@ struct EigenHingeLossGrad<Eigen::DefaultDevice, T> {
 template struct EigenHingeLoss<Eigen::DefaultDevice, float>;
 template struct EigenHingeLossGrad<Eigen::DefaultDevice, float>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/loss.cu b/paddle/pten/kernels/funcs/eigen/loss.cu
similarity index 69%
rename from paddle/fluid/operators/eigen/loss.cu
rename to paddle/pten/kernels/funcs/eigen/loss.cu
index 02341202a2b4f18acc79f7bd4d4c69a69a039eca..484bae4c224e276e35cdce1dec141abe4e30ee5a 100644
--- a/paddle/fluid/operators/eigen/loss.cu
+++ b/paddle/pten/kernels/funcs/eigen/loss.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,10 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenRankLoss<Eigen::GpuDevice, T> {
@@ -22,8 +22,10 @@ struct EigenRankLoss<Eigen::GpuDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType out,
-                   const InType& label, const InType& left,
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType out,
+                   const InType& label,
+                   const InType& left,
                    const InType& right) {
     out.device(dev) =
         (1.0f + (left - right).exp()).log() - label * (left - right);
@@ -37,15 +39,21 @@ struct EigenRankLossGrad<Eigen::GpuDevice, T> {
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
 
-  static void EvalLeft(const Eigen::GpuDevice& dev, OutType dleft,
-                       const InType& dout, const InType& label,
-                       const InType& left, const InType& right) {
+  static void EvalLeft(const Eigen::GpuDevice& dev,
+                       OutType dleft,
+                       const InType& dout,
+                       const InType& label,
+                       const InType& left,
+                       const InType& right) {
     dleft.device(dev) = dout * (1.0f / (1.0f + (right - left).exp()) - label);
   }
 
-  static void EvalRight(const Eigen::GpuDevice& dev, OutType dright,
-                        const InType& dout, const InType& label,
-                        const InType& left, const InType& right) {
+  static void EvalRight(const Eigen::GpuDevice& dev,
+                        OutType dright,
+                        const InType& dout,
+                        const InType& label,
+                        const InType& left,
+                        const InType& right) {
     dright.device(dev) = -dout * (1.0f / (1.0f + (right - left).exp()) - label);
   }
 };
@@ -59,8 +67,11 @@ struct EigenLogLoss<Eigen::GpuDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& pred,
-                   const InType& label, const T& epsilon) {
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType out,
+                   const InType& pred,
+                   const InType& label,
+                   const T& epsilon) {
     out.device(dev) = (-(label * (pred + epsilon).log()) -
                        ((static_cast<T>(1) - label) *
                         (static_cast<T>(1) - pred + epsilon).log()));
@@ -73,8 +84,11 @@ struct EigenLogLossGrad<Eigen::GpuDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType dpred,
-                   const InType& dloss, const InType& pred, const InType& label,
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType dpred,
+                   const InType& dloss,
+                   const InType& pred,
+                   const InType& label,
                    const T& epsilon) {
     dpred.device(dev) =
         dloss *
@@ -92,8 +106,10 @@ struct EigenHingeLoss<Eigen::GpuDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType loss,
-                   const InType& pred, const InType& label) {
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType loss,
+                   const InType& pred,
+                   const InType& label) {
     loss.device(dev) = (static_cast<T>(1) -
                         pred * (static_cast<T>(2) * label - static_cast<T>(1)))
                            .cwiseMax(static_cast<T>(0));
@@ -106,8 +122,10 @@ struct EigenHingeLossGrad<Eigen::GpuDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType dpred,
-                   const InType& dloss, const InType& pred,
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType dpred,
+                   const InType& dloss,
+                   const InType& pred,
                    const InType& label) {
     auto alt_labels = static_cast<T>(2) * label - static_cast<T>(1);
     dpred.device(dev) =
@@ -119,5 +137,5 @@ struct EigenHingeLossGrad<Eigen::GpuDevice, T> {
 template struct EigenHingeLoss<Eigen::GpuDevice, float>;
 template struct EigenHingeLossGrad<Eigen::GpuDevice, float>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/pad.cc b/paddle/pten/kernels/funcs/eigen/pad.cc
similarity index 73%
rename from paddle/fluid/operators/eigen/pad.cc
rename to paddle/pten/kernels/funcs/eigen/pad.cc
index 9db4571357a78781669951d4c672344d2555cde4..ed4b1c0643a906e592a66f31b75f3e5f97001ea9 100644
--- a/paddle/fluid/operators/eigen/pad.cc
+++ b/paddle/pten/kernels/funcs/eigen/pad.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/platform/complex.h"
+#include "paddle/pten/common/complex.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T, int Rank>
 struct EigenPad<Eigen::DefaultDevice, T, Rank> {
@@ -32,13 +32,18 @@ struct EigenPad<Eigen::DefaultDevice, T, Rank> {
       Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
                        Eigen::Aligned>;
 
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
-                   const InType& in, const Array& padding, const T value) {
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const Array& padding,
+                   const T value) {
     out.device(dev) = in.pad(padding, value);
   }
 
-  static void Eval(const Eigen::DefaultDevice& dev, OutType32BitIndex out,
-                   const InType32BitIndex& in, const Array32Bit& padding,
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType32BitIndex out,
+                   const InType32BitIndex& in,
+                   const Array32Bit& padding,
                    const T value) {
     out.device(dev) = in.pad(padding, value);
   }
@@ -56,9 +61,9 @@ INSTANTIATION(EigenPad, int);
 INSTANTIATION(EigenPad, int64_t);
 INSTANTIATION(EigenPad, float);
 INSTANTIATION(EigenPad, double);
-INSTANTIATION(EigenPad, platform::complex<float>);
-INSTANTIATION(EigenPad, platform::complex<double>);
+INSTANTIATION(EigenPad, dtype::complex<float>);
+INSTANTIATION(EigenPad, dtype::complex<double>);
 #undef INSTANTIATION
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/pad.cu b/paddle/pten/kernels/funcs/eigen/pad.cu
similarity index 68%
rename from paddle/fluid/operators/eigen/pad.cu
rename to paddle/pten/kernels/funcs/eigen/pad.cu
index e028a8aef18cfc62c1541cc1931f95b772df8768..6d40adce877173cd9c00fb5b5cd8faa47a483ccc 100644
--- a/paddle/fluid/operators/eigen/pad.cu
+++ b/paddle/pten/kernels/funcs/eigen/pad.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,13 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/common/complex.h"
+#include "paddle/pten/common/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T, int Rank>
 struct EigenPad<Eigen::GpuDevice, T, Rank> {
@@ -34,13 +34,18 @@ struct EigenPad<Eigen::GpuDevice, T, Rank> {
       Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
                        Eigen::Aligned>;
 
-  static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in,
-                   const Array& padding, const T value) {
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const Array& padding,
+                   const T value) {
     out.device(dev) = in.pad(padding, value);
   }
 
-  static void Eval(const Eigen::GpuDevice& dev, OutType32BitIndex out,
-                   const InType32BitIndex& in, const Array32Bit& padding,
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType32BitIndex out,
+                   const InType32BitIndex& in,
+                   const Array32Bit& padding,
                    const T value) {
     out.device(dev) = in.pad(padding, value);
   }
@@ -58,11 +63,11 @@ INSTANTIATION(EigenPad, int);
 INSTANTIATION(EigenPad, int64_t);
 INSTANTIATION(EigenPad, float);
 INSTANTIATION(EigenPad, double);
-INSTANTIATION(EigenPad, platform::float16);
-INSTANTIATION(EigenPad, platform::bfloat16);
-INSTANTIATION(EigenPad, platform::complex<float>);
-INSTANTIATION(EigenPad, platform::complex<double>);
+INSTANTIATION(EigenPad, dtype::float16);
+INSTANTIATION(EigenPad, dtype::bfloat16);
+INSTANTIATION(EigenPad, dtype::complex<float>);
+INSTANTIATION(EigenPad, dtype::complex<double>);
 #undef INSTANTIATION
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/reverse.cc b/paddle/pten/kernels/funcs/eigen/reverse.cc
similarity index 81%
rename from paddle/fluid/operators/eigen/reverse.cc
rename to paddle/pten/kernels/funcs/eigen/reverse.cc
index 02044479db952ff27c06148ca39c4a2a3e36330a..9e77ae4fba529d8d96dabb147060339141a2451e 100644
--- a/paddle/fluid/operators/eigen/reverse.cc
+++ b/paddle/pten/kernels/funcs/eigen/reverse.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,10 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T, int Rank>
 struct EigenReverse<Eigen::DefaultDevice, T, Rank> {
@@ -23,8 +23,10 @@ struct EigenReverse<Eigen::DefaultDevice, T, Rank> {
       Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType = Eigen::TensorMap<
       Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
-                   const InType& in, const Array& reverse) {
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const Array& reverse) {
     out.device(dev) = in.reverse(reverse);
   }
 };
@@ -44,5 +46,5 @@ INSTANTIATION(EigenReverse, float);
 INSTANTIATION(EigenReverse, double);
 #undef INSTANTIATION
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/reverse.cu b/paddle/pten/kernels/funcs/eigen/reverse.cu
similarity index 83%
rename from paddle/fluid/operators/eigen/reverse.cu
rename to paddle/pten/kernels/funcs/eigen/reverse.cu
index 9b769489ce723678b2cc1440bf6c3d374e3a55d6..fc6c3b29ab86ed088e5270be977fd65e87b87ac7 100644
--- a/paddle/fluid/operators/eigen/reverse.cu
+++ b/paddle/pten/kernels/funcs/eigen/reverse.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,10 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T, int Rank>
 struct EigenReverse<Eigen::GpuDevice, T, Rank> {
@@ -23,7 +23,9 @@ struct EigenReverse<Eigen::GpuDevice, T, Rank> {
       Eigen::Tensor<const T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType = Eigen::TensorMap<
       Eigen::Tensor<T, Rank, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in,
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType out,
+                   const InType& in,
                    const Array& reverse) {
     out.device(dev) = in.reverse(reverse);
   }
@@ -44,5 +46,5 @@ INSTANTIATION(EigenReverse, float);
 INSTANTIATION(EigenReverse, double);
 #undef INSTANTIATION
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/scale.cc b/paddle/pten/kernels/funcs/eigen/scale.cc
similarity index 67%
rename from paddle/fluid/operators/eigen/scale.cc
rename to paddle/pten/kernels/funcs/eigen/scale.cc
index d9fbb878e35ea77638aad7ef98e20979852a5881..cd34cd6e700ea8fa84c29f80e1c76b51f7612dad 100644
--- a/paddle/fluid/operators/eigen/scale.cc
+++ b/paddle/pten/kernels/funcs/eigen/scale.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,12 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
+#include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/common/complex.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenScale<Eigen::DefaultDevice, T> {
@@ -24,8 +24,11 @@ struct EigenScale<Eigen::DefaultDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
-                   const InType& in, const T scale, const T bias,
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const T scale,
+                   const T bias,
                    const bool bias_after_scale) {
     if (bias_after_scale) {
       out.device(dev) = scale * in + bias;
@@ -37,14 +40,14 @@ struct EigenScale<Eigen::DefaultDevice, T> {
 
 template struct EigenScale<Eigen::DefaultDevice, float>;
 template struct EigenScale<Eigen::DefaultDevice, double>;
-template struct EigenScale<Eigen::DefaultDevice, platform::bfloat16>;
+template struct EigenScale<Eigen::DefaultDevice, dtype::bfloat16>;
 template struct EigenScale<Eigen::DefaultDevice, uint8_t>;
 template struct EigenScale<Eigen::DefaultDevice, int8_t>;
 template struct EigenScale<Eigen::DefaultDevice, int16_t>;
 template struct EigenScale<Eigen::DefaultDevice, int>;
 template struct EigenScale<Eigen::DefaultDevice, int64_t>;
-template struct EigenScale<Eigen::DefaultDevice, platform::complex<float>>;
-template struct EigenScale<Eigen::DefaultDevice, platform::complex<double>>;
+template struct EigenScale<Eigen::DefaultDevice, dtype::complex<float>>;
+template struct EigenScale<Eigen::DefaultDevice, dtype::complex<double>>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/scale.cu b/paddle/pten/kernels/funcs/eigen/scale.cu
similarity index 64%
rename from paddle/fluid/operators/eigen/scale.cu
rename to paddle/pten/kernels/funcs/eigen/scale.cu
index 5e485799af52c674c37a781ccac534e9b1083014..f1cbbd6a9bc78b1e1ac8a81f2c75ddba026ea402 100644
--- a/paddle/fluid/operators/eigen/scale.cu
+++ b/paddle/pten/kernels/funcs/eigen/scale.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,12 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/common/complex.h"
+#include "paddle/pten/common/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenScale<Eigen::GpuDevice, T> {
@@ -24,8 +24,12 @@ struct EigenScale<Eigen::GpuDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in,
-                   const T scale, const T bias, const bool bias_after_scale) {
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const T scale,
+                   const T bias,
+                   const bool bias_after_scale) {
     if (bias_after_scale) {
       out.device(dev) = scale * in + bias;
     } else {
@@ -41,9 +45,9 @@ template struct EigenScale<Eigen::GpuDevice, int8_t>;
 template struct EigenScale<Eigen::GpuDevice, int16_t>;
 template struct EigenScale<Eigen::GpuDevice, int>;
 template struct EigenScale<Eigen::GpuDevice, int64_t>;
-template struct EigenScale<Eigen::GpuDevice, platform::float16>;
-template struct EigenScale<Eigen::GpuDevice, platform::complex<float>>;
-template struct EigenScale<Eigen::GpuDevice, platform::complex<double>>;
+template struct EigenScale<Eigen::GpuDevice, dtype::float16>;
+template struct EigenScale<Eigen::GpuDevice, dtype::complex<float>>;
+template struct EigenScale<Eigen::GpuDevice, dtype::complex<double>>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/sign.cc b/paddle/pten/kernels/funcs/eigen/sign.cc
similarity index 77%
rename from paddle/fluid/operators/eigen/sign.cc
rename to paddle/pten/kernels/funcs/eigen/sign.cc
index 4a4445f6569d388a4181eec1bed2faf190aeb729..99ecfadea35aa5208b8d33eb9842395bb24e4e96 100644
--- a/paddle/fluid/operators/eigen/sign.cc
+++ b/paddle/pten/kernels/funcs/eigen/sign.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,10 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenSign<Eigen::DefaultDevice, T> {
@@ -22,7 +22,8 @@ struct EigenSign<Eigen::DefaultDevice, T> {
       Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
   using OutType =
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
                    const InType& in) {
     out.device(dev) = in.sign();
   }
@@ -31,5 +32,5 @@ struct EigenSign<Eigen::DefaultDevice, T> {
 template struct EigenSign<Eigen::DefaultDevice, float>;
 template struct EigenSign<Eigen::DefaultDevice, double>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/sign.cu b/paddle/pten/kernels/funcs/eigen/sign.cu
similarity index 73%
rename from paddle/fluid/operators/eigen/sign.cu
rename to paddle/pten/kernels/funcs/eigen/sign.cu
index 52c8d3c80dd2c5d0d64e9a92ae596d7b69e70476..aad73c917ccdf209e3f0aad0a4804ba33e7569f8 100644
--- a/paddle/fluid/operators/eigen/sign.cu
+++ b/paddle/pten/kernels/funcs/eigen/sign.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,12 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/platform/eigen_ext.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/common/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T>
 struct EigenSign<Eigen::GpuDevice, T> {
@@ -31,7 +31,7 @@ struct EigenSign<Eigen::GpuDevice, T> {
 
 template struct EigenSign<Eigen::GpuDevice, float>;
 template struct EigenSign<Eigen::GpuDevice, double>;
-template struct EigenSign<Eigen::GpuDevice, platform::float16>;
+template struct EigenSign<Eigen::GpuDevice, dtype::float16>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/slice.cc b/paddle/pten/kernels/funcs/eigen/slice.cc
similarity index 72%
rename from paddle/fluid/operators/eigen/slice.cc
rename to paddle/pten/kernels/funcs/eigen/slice.cc
index 2579b5f07eb27817f5488d8065fa05f409d1163f..e838137a76179fe2e17b6aa766684fbbec40c2ca 100644
--- a/paddle/fluid/operators/eigen/slice.cc
+++ b/paddle/pten/kernels/funcs/eigen/slice.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,13 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/common/complex.h"
+#include "paddle/pten/common/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T, int Rank>
 struct EigenSlice<Eigen::DefaultDevice, T, Rank> {
@@ -34,14 +34,18 @@ struct EigenSlice<Eigen::DefaultDevice, T, Rank> {
       Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
                        Eigen::Aligned>;
 
-  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
-                   const InType& in, const Array& offsets,
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const Array& offsets,
                    const Array& extents) {
     out.device(dev) = in.slice(offsets, extents);
   }
 
-  static void Eval(const Eigen::DefaultDevice& dev, OutType32BitIndex out,
-                   const InType32BitIndex& in, const Array32Bit& offsets,
+  static void Eval(const Eigen::DefaultDevice& dev,
+                   OutType32BitIndex out,
+                   const InType32BitIndex& in,
+                   const Array32Bit& offsets,
                    const Array32Bit& extents) {
     out.device(dev) = in.slice(offsets, extents);
   }
@@ -65,11 +69,11 @@ INSTANTIATION(EigenSlice, int16_t);
 INSTANTIATION(EigenSlice, int64_t);
 INSTANTIATION(EigenSlice, float);
 INSTANTIATION(EigenSlice, double);
-INSTANTIATION(EigenSlice, platform::float16);
-INSTANTIATION(EigenSlice, platform::bfloat16);
-INSTANTIATION(EigenSlice, platform::complex<float>);
-INSTANTIATION(EigenSlice, platform::complex<double>);
+INSTANTIATION(EigenSlice, dtype::float16);
+INSTANTIATION(EigenSlice, dtype::bfloat16);
+INSTANTIATION(EigenSlice, dtype::complex<float>);
+INSTANTIATION(EigenSlice, dtype::complex<double>);
 #undef INSTANTIATION
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/fluid/operators/eigen/slice.cu b/paddle/pten/kernels/funcs/eigen/slice.cu
similarity index 68%
rename from paddle/fluid/operators/eigen/slice.cu
rename to paddle/pten/kernels/funcs/eigen/slice.cu
index 3dfd0500cc954f3990ed12d2be5b1a653c733d74..b68a556f9c30454a7f073e202f5d0b1ecd9f6474 100644
--- a/paddle/fluid/operators/eigen/slice.cu
+++ b/paddle/pten/kernels/funcs/eigen/slice.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,13 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/common/complex.h"
+#include "paddle/pten/common/float16.h"
+#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
+namespace pten {
+namespace funcs {
 
 template <typename T, int Rank>
 struct EigenSlice<Eigen::GpuDevice, T, Rank> {
@@ -34,13 +34,18 @@ struct EigenSlice<Eigen::GpuDevice, T, Rank> {
       Eigen::TensorMap<Eigen::Tensor<T, Rank, Eigen::RowMajor, int>,
                        Eigen::Aligned>;
 
-  static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in,
-                   const Array& offsets, const Array& extents) {
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType out,
+                   const InType& in,
+                   const Array& offsets,
+                   const Array& extents) {
     out.device(dev) = in.slice(offsets, extents);
   }
 
-  static void Eval(const Eigen::GpuDevice& dev, OutType32BitIndex out,
-                   const InType32BitIndex& in, const Array32Bit& offsets,
+  static void Eval(const Eigen::GpuDevice& dev,
+                   OutType32BitIndex out,
+                   const InType32BitIndex& in,
+                   const Array32Bit& offsets,
                    const Array32Bit& extents) {
     out.device(dev) = in.slice(offsets, extents);
   }
@@ -58,11 +63,11 @@ INSTANTIATION(EigenSlice, int);
 INSTANTIATION(EigenSlice, int64_t);
 INSTANTIATION(EigenSlice, float);
 INSTANTIATION(EigenSlice, double);
-INSTANTIATION(EigenSlice, platform::float16);
-INSTANTIATION(EigenSlice, platform::bfloat16);
-INSTANTIATION(EigenSlice, platform::complex<float>);
-INSTANTIATION(EigenSlice, platform::complex<double>);
+INSTANTIATION(EigenSlice, dtype::float16);
+INSTANTIATION(EigenSlice, dtype::bfloat16);
+INSTANTIATION(EigenSlice, dtype::complex<float>);
+INSTANTIATION(EigenSlice, dtype::complex<double>);
 #undef INSTANTIATION
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h
index 47924c4e2ae189d93bda139fc4d325d8ff7f9529..1c18e9f7998adc777c1f267ecf66ba1ad673112b 100644
--- a/paddle/pten/kernels/funcs/elementwise_base.h
+++ b/paddle/pten/kernels/funcs/elementwise_base.h
@@ -227,7 +227,7 @@ class TransformFunctor {
                    const bool is_xsize_larger = true)
       : x_(x.data<T>()),
         y_(y.data<T>()),
-        z_(z->mutable_data<OutType>()),
+        z_(z->mutable_data<OutType>(ctx.GetPlace())),
         nx_(x.numel()),
         ctx_(ctx),
         func_(func),
@@ -585,7 +585,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
     ins_data[i] = ins[i]->data<InT>();
   }
   for (int i = 0; i < NumOuts; ++i) {
-    outs_data[i] = (*outs)[i]->mutable_data<OutT>();
+    outs_data[i] = (*outs)[i]->mutable_data<OutT>(ctx.GetPlace());
   }
 #ifdef PADDLE_WITH_XPU2
   int block_size = 64;
diff --git a/paddle/pten/kernels/funcs/transpose.cc b/paddle/pten/kernels/funcs/transpose.cc
index 90a6859a850910d0daebf586e6eede0febf12fe1..13cfaedb33d38ee2bb6052ea622fc59b659f581a 100644
--- a/paddle/pten/kernels/funcs/transpose.cc
+++ b/paddle/pten/kernels/funcs/transpose.cc
@@ -36,7 +36,7 @@ struct TransposeNormal<CPUContext, T> {
     auto in_stride = pten::framework::stride(in.dims());
     auto out_stride = pten::framework::stride(out->dims());
     const T* in_ptr = in.data<T>();
-    T* out_ptr = out->mutable_data<T>();
+    T* out_ptr = out->mutable_data<T>(dev_ctx.GetPlace());
 
     auto transpose_helper = [&](int64_t beg, int64_t end) {
       for (int64_t out_idx = beg; out_idx < end; ++out_idx) {
@@ -63,11 +63,8 @@ DEFINE_CPU_TRANS_NORMAL(bool);
 DEFINE_CPU_TRANS_NORMAL(int8_t);
 DEFINE_CPU_TRANS_NORMAL(uint8_t);
 DEFINE_CPU_TRANS_NORMAL(int16_t);
-DEFINE_CPU_TRANS_NORMAL(uint16_t);
 DEFINE_CPU_TRANS_NORMAL(int32_t);
-DEFINE_CPU_TRANS_NORMAL(uint32_t);
 DEFINE_CPU_TRANS_NORMAL(int64_t);
-DEFINE_CPU_TRANS_NORMAL(uint64_t);
 DEFINE_CPU_TRANS_NORMAL(float);
 DEFINE_CPU_TRANS_NORMAL(double);
 DEFINE_CPU_TRANS_NORMAL(paddle::platform::float16);
diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu
index 474a7c4ea4de9254ae3b028cc925a5154d8d8787..24d72ca3d81ce455bcaee2d9d82261707674fb2c 100644
--- a/paddle/pten/kernels/funcs/transpose.cu
+++ b/paddle/pten/kernels/funcs/transpose.cu
@@ -61,7 +61,7 @@ struct TransposeNormal<GPUContext, T> {
     auto in_stride = pten::framework::stride(in.dims());
     auto out_stride = pten::framework::stride(out->dims());
     auto* in_ptr = in.data<T>();
-    auto* out_ptr = out->mutable_data<T>();
+    auto* out_ptr = out->mutable_data<T>(dev_ctx.GetPlace());
 
     // copy in_stride, out_stride, axis to gpu device
     const paddle::platform::CUDAPlace& cuda_place = dev_ctx.GetPlace();
@@ -110,11 +110,8 @@ DEFINE_GPU_TRANS_NORMAL(bool);
 DEFINE_GPU_TRANS_NORMAL(int8_t);
 DEFINE_GPU_TRANS_NORMAL(uint8_t);
 DEFINE_GPU_TRANS_NORMAL(int16_t);
-DEFINE_GPU_TRANS_NORMAL(uint16_t);
 DEFINE_GPU_TRANS_NORMAL(int32_t);
-DEFINE_GPU_TRANS_NORMAL(uint32_t);
 DEFINE_GPU_TRANS_NORMAL(int64_t);
-DEFINE_GPU_TRANS_NORMAL(uint64_t);
 DEFINE_GPU_TRANS_NORMAL(float);
 DEFINE_GPU_TRANS_NORMAL(double);
 DEFINE_GPU_TRANS_NORMAL(paddle::platform::float16);
diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu
index 3774c56370b45f1fe92cb6d9e73c044cf75f139f..12f246c3238d067a49032f077d472609c570cb93 100644
--- a/paddle/pten/kernels/gpu/cast_kernel.cu
+++ b/paddle/pten/kernels/gpu/cast_kernel.cu
@@ -43,7 +43,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx,
   std::vector<DenseTensor*> outputs;
   inputs.emplace_back(&x);
   outputs.emplace_back(out);
-  out->mutable_data<OutT>();
+  out->mutable_data<OutT>(dev_ctx.GetPlace());
   pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
                                                    InT,
                                                    OutT>(
diff --git a/paddle/pten/kernels/gpu/copy_kernel.cu b/paddle/pten/kernels/gpu/copy_kernel.cu
index 1f7a08e8254e26c18cb764746d2712d3bf71c2be..d2578723158317f485f0e93a1a5b93477db5df04 100644
--- a/paddle/pten/kernels/gpu/copy_kernel.cu
+++ b/paddle/pten/kernels/gpu/copy_kernel.cu
@@ -43,7 +43,7 @@ void Copy(const Context& dev_ctx,
           << dst_place;
 
   dst->ResizeAndAllocate(src.dims());
-  auto* dst_ptr = dst->mutable_data();
+  auto* dst_ptr = dst->mutable_data(dst_place);
 
   if (src_ptr == dst_ptr && src_place == dst_place) {
     VLOG(3) << "Skip copy the same data async from " << src_place << " to "
diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu
index 5fe397e1283bd8d10af4e32f672589e1b29577e7..75aacc8d3d1179861526a68bc9a30cb27340adf8 100644
--- a/paddle/pten/kernels/gpu/dot_kernel.cu
+++ b/paddle/pten/kernels/gpu/dot_kernel.cu
@@ -29,7 +29,7 @@ void DotKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const DenseTensor& y,
                DenseTensor* out) {
-  out->mutable_data<T>();
+  out->mutable_data<T>(dev_ctx.GetPlace());
   if (1 == out->dims().size()) {
     auto eigen_out = pten::EigenScalar<T>::From(*out);
     auto eigen_x = pten::EigenVector<T>::Flatten(x);
diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h
index def54e24840e7973f94c1bfce447327523984a55..f4d8e442fcdebfe76cfa89df82d3132a7a65fae4 100644
--- a/paddle/pten/kernels/gpu/elementwise.h
+++ b/paddle/pten/kernels/gpu/elementwise.h
@@ -350,7 +350,7 @@ void LaunchKernel(const KPDevice &ctx,
   pten::framework::Array<_ptr_ OutT *, NumOuts> outs_data;
 
   for (int i = 0; i < NumOuts; ++i) {
-    outs_data[i] = (*outs)[i]->mutable_data<OutT>();
+    outs_data[i] = (*outs)[i]->mutable_data<OutT>(ctx.GetPlace());
   }
 
   for (int i = 0; i < Arity; i++) {
diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu
index 6b6383f81065bfa85d5288298758bfb9720ee7f8..d06dc1c43f6d41bc988283b2cfa9be072f4a69c8 100644
--- a/paddle/pten/kernels/gpu/math_kernel.cu
+++ b/paddle/pten/kernels/gpu/math_kernel.cu
@@ -47,7 +47,7 @@ namespace pten {
     inputs.emplace_back(&x);                                         \
     inputs.emplace_back(&y);                                         \
     outputs.emplace_back(out);                                       \
-    out->mutable_data<T>();                                          \
+    out->mutable_data<T>(dev_ctx.GetPlace());                        \
     LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(     \
         dev_ctx, inputs, &outputs, axis, funcs::name##Functor<T>()); \
   }
diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h
index e247f786cc68d84fd6434695a9bf85ea6fabcad0..26f17bc00507e8bc401a50942ce951710b120d64 100644
--- a/paddle/pten/kernels/gpu/reduce.h
+++ b/paddle/pten/kernels/gpu/reduce.h
@@ -328,7 +328,7 @@ struct ReduceConfig {
     if (should_reduce_again) {
       tmp->ResizeAndAllocate(pten::framework::make_ddim(
           {static_cast<int64_t>(left_num * grid.z * grid.y * sizeof(Ty))}));
-      output_data = tmp->mutable_data<Ty>();
+      output_data = tmp->mutable_data<Ty>(place);
     } else {
       output_data = y_data;
     }
@@ -1032,7 +1032,7 @@ static
                             pten::framework::make_ddim(
                                 {static_cast<int64_t>(temp_storage_bytes)})));
 
-  auto* temp_storage = tmp.mutable_data<uint8_t>();
+  auto* temp_storage = tmp.mutable_data<uint8_t>(place);
 
   cub::DeviceReduce::Reduce(temp_storage,
                             temp_storage_bytes,
@@ -1070,8 +1070,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
                              const TransformOp& transform,
                              const std::vector<int>& origin_reduce_dims,
                              gpuStream_t stream) {
-  // Allocate memory
-  y->mutable_data<Ty>();
+  y->mutable_data<Ty>(x.place());
 
   auto x_dim = pten::framework::vectorize<int>(x.dims());
   auto config = ReduceConfig<Ty>(origin_reduce_dims, x_dim);
@@ -1088,7 +1087,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
       pten::DenseTensorMeta(y->dtype(), tmp_ddim, y->layout()));
 
   auto x_data = x.data<Tx>();
-  auto y_data = y->mutable_data<Ty>();
+  auto y_data = y->data<Ty>();
 
   auto* dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
       paddle::platform::DeviceContextPool::Instance().Get(x.place()));
diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu
index e729dad3b36234b70133701286137671975e10c1..dd7c2f242ea4dbb473702c63716393566cd912c5 100644
--- a/paddle/pten/kernels/gpu/scale_kernel.cu
+++ b/paddle/pten/kernels/gpu/scale_kernel.cu
@@ -54,7 +54,7 @@ void ScaleKernel(const Context& dev_ctx,
   std::vector<DenseTensor*> outputs;
   inputs.emplace_back(&x);
   outputs.emplace_back(out);
-  out->mutable_data<T>();
+  out->mutable_data<T>(dev_ctx.GetPlace());
   pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
                                                    T,
                                                    T>(
diff --git a/paddle/pten/kernels/impl/complex_kernel_impl.h b/paddle/pten/kernels/impl/complex_kernel_impl.h
index d7132b05f7f0434a03653f8784ec0739e79be579..aa878f7e9eb7f157ab193ff5129e5752aded67e6 100644
--- a/paddle/pten/kernels/impl/complex_kernel_impl.h
+++ b/paddle/pten/kernels/impl/complex_kernel_impl.h
@@ -26,7 +26,7 @@ void ConjKernel(const Context& dev_ctx,
                 DenseTensor* out) {
   auto numel = x.numel();
   auto* x_data = x.data<T>();
-  auto* out_data = out->mutable_data<T>();
+  auto* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
 
   paddle::platform::ForRange<Context> for_range(dev_ctx, numel);
   paddle::operators::math::ConjFunctor<T> functor(x_data, numel, out_data);
diff --git a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h
index 557f6fae7b7f98efd17e0447d7c9c13498e420bf..d0c6cf6793e6d37e3aad4a3a601280a9f02d0013 100644
--- a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h
+++ b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h
@@ -73,7 +73,7 @@ struct DotGradFunction<DeviceContext,
       auto dout = EigenMatrix<T>::From(*tensor_dout);
 
       if (tensor_dx) {
-        tensor_dx->mutable_data<T>();
+        tensor_dx->mutable_data<T>(ctx.GetPlace());
         auto y = EigenMatrix<T>::From(*tensor_y);
         auto& dev = *ctx.eigen_device();
         Eigen::DSizes<int, 2> size(1, tensor_dx->dims()[1]);
@@ -85,7 +85,7 @@ struct DotGradFunction<DeviceContext,
       }
 
       if (tensor_dy) {
-        tensor_dy->mutable_data<T>();
+        tensor_dy->mutable_data<T>(ctx.GetPlace());
         auto x = EigenMatrix<T>::From(*tensor_x);
         auto& dev = *ctx.eigen_device();
         Eigen::DSizes<int, 2> size(1, tensor_dy->dims()[1]);
@@ -100,7 +100,7 @@ struct DotGradFunction<DeviceContext,
     const auto* data_dout = tensor_dout->data<T>();
 
     if (tensor_dx) {
-      auto* data_dx = tensor_dx->mutable_data<T>();
+      auto* data_dx = tensor_dx->mutable_data<T>(ctx.GetPlace());
       const auto* data_y = tensor_y->data<T>();
       const DDim& dim = tensor_x->dims();
       size_t N = static_cast<size_t>(pten::framework::product(dim));
@@ -115,7 +115,7 @@ struct DotGradFunction<DeviceContext,
     }
 
     if (tensor_dy) {
-      auto* data_dy = tensor_dy->mutable_data<T>();
+      auto* data_dy = tensor_dy->mutable_data<T>(ctx.GetPlace());
       const auto* data_x = tensor_x->data<T>();
       const DDim& dim = tensor_y->dims();
       size_t N = static_cast<size_t>(pten::framework::product(dim));
@@ -164,7 +164,7 @@ struct DotGradFunction<DeviceContext,
       auto dout = EigenMatrix<T>::From(*tensor_dout);
 
       if (tensor_dx) {
-        tensor_dx->mutable_data<T>();
+        tensor_dx->mutable_data<T>(ctx.GetPlace());
         auto y = EigenMatrix<T>::From(*tensor_y);
         auto dx = EigenMatrix<T>::From(*tensor_dx);
         auto& dev = *ctx.eigen_device();
@@ -173,7 +173,7 @@ struct DotGradFunction<DeviceContext,
       }
 
       if (tensor_dy) {
-        tensor_dy->mutable_data<T>();
+        tensor_dy->mutable_data<T>(ctx.GetPlace());
         auto x = EigenMatrix<T>::From(*tensor_x);
         auto dy = EigenMatrix<T>::From(*tensor_dy);
         auto& dev = *ctx.eigen_device();
@@ -189,7 +189,7 @@ struct DotGradFunction<DeviceContext,
     auto const B = d[d.size() - 1];
 
     if (tensor_dx) {
-      auto* dx = tensor_dx->mutable_data<T>();
+      auto* dx = tensor_dx->mutable_data<T>(ctx.GetPlace());
       for (auto j = 0; j < N / B; ++j) {
         auto const ss = dz[j];
         for (auto i = 0; i < B; ++i) *dx++ = *y++ * ss;
@@ -197,7 +197,7 @@ struct DotGradFunction<DeviceContext,
     }
 
     if (tensor_dy) {
-      auto* dy = tensor_dy->mutable_data<T>();
+      auto* dy = tensor_dy->mutable_data<T>(ctx.GetPlace());
       for (auto j = 0; j < N / B; ++j) {
         auto const ss = dz[j];
         for (auto i = 0; i < B; i++) *dy++ = *x++ * ss;
@@ -272,7 +272,7 @@ struct DotDoubleGradFunction<DeviceContext,
     const auto* data_dout = tensor_dout->data<T>();
 
     if (tensor_dx) {
-      auto* data_dx = tensor_dx->mutable_data<T>();
+      auto* data_dx = tensor_dx->mutable_data<T>(ctx.GetPlace());
       const auto* data_ddy = tensor_ddy->data<T>();
       const DDim& dim = tensor_dx->dims();
       size_t N = static_cast<size_t>(product(dim));
@@ -287,7 +287,7 @@ struct DotDoubleGradFunction<DeviceContext,
     }
 
     if (tensor_dy) {
-      auto* data_dy = tensor_dy->mutable_data<T>();
+      auto* data_dy = tensor_dy->mutable_data<T>(ctx.GetPlace());
       const auto* data_ddx = tensor_ddx->data<T>();
       const DDim& dim = tensor_dy->dims();
       size_t N = static_cast<size_t>(product(dim));
@@ -302,7 +302,7 @@ struct DotDoubleGradFunction<DeviceContext,
     }
 
     if (tensor_ddout) {
-      auto* data_ddout = tensor_ddout->mutable_data<T>();
+      auto* data_ddout = tensor_ddout->mutable_data<T>(ctx.GetPlace());
       auto* data_x = tensor_x->data<T>();
       auto* data_y = tensor_y->data<T>();
       auto* data_ddx = tensor_ddx->data<T>();
@@ -351,7 +351,7 @@ struct DotDoubleGradFunction<DeviceContext,
       auto& dev = *ctx.eigen_device();
       auto dout = EigenVector<T>::Flatten(*tensor_dout);
       if (tensor_dx) {
-        tensor_dx->mutable_data<T>();
+        tensor_dx->mutable_data<T>(ctx.GetPlace());
         auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
         Eigen::DSizes<int, 1> size(tensor_ddy->numel());
         auto dx = EigenVector<T>::Flatten(*tensor_dx);
@@ -359,7 +359,7 @@ struct DotDoubleGradFunction<DeviceContext,
       }
 
       if (tensor_dy) {
-        tensor_dy->mutable_data<T>();
+        tensor_dy->mutable_data<T>(ctx.GetPlace());
         auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
         Eigen::DSizes<int, 1> size(tensor_ddx->numel());
 
@@ -368,7 +368,7 @@ struct DotDoubleGradFunction<DeviceContext,
       }
 
       if (tensor_ddout) {
-        tensor_ddout->mutable_data<T>();
+        tensor_ddout->mutable_data<T>(ctx.GetPlace());
         auto x = EigenVector<T>::Flatten(*tensor_x);
         auto y = EigenVector<T>::Flatten(*tensor_y);
         auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
@@ -381,7 +381,7 @@ struct DotDoubleGradFunction<DeviceContext,
     const auto* data_dout = tensor_dout->data<T>();
 
     if (tensor_dx) {
-      auto* data_dx = tensor_dx->mutable_data<T>();
+      auto* data_dx = tensor_dx->mutable_data<T>(ctx.GetPlace());
       const auto* data_ddy = tensor_ddy->data<T>();
       const DDim& dim = tensor_dx->dims();
       size_t N = static_cast<size_t>(product(dim));
@@ -396,7 +396,7 @@ struct DotDoubleGradFunction<DeviceContext,
     }
 
     if (tensor_dy) {
-      auto* data_dy = tensor_dy->mutable_data<T>();
+      auto* data_dy = tensor_dy->mutable_data<T>(ctx.GetPlace());
       const auto* data_ddx = tensor_ddx->data<T>();
       const DDim& dim = tensor_dy->dims();
       size_t N = static_cast<size_t>(product(dim));
@@ -411,7 +411,7 @@ struct DotDoubleGradFunction<DeviceContext,
     }
 
     if (tensor_ddout) {
-      auto* data_ddout = tensor_ddout->mutable_data<T>();
+      auto* data_ddout = tensor_ddout->mutable_data<T>(ctx.GetPlace());
       auto* data_x = tensor_x->data<T>();
       auto* data_y = tensor_y->data<T>();
       auto* data_ddx = tensor_ddx->data<T>();
@@ -552,7 +552,7 @@ struct DotTripleGradFunction<DeviceContext,
     const auto* data_d_ddout = in_tensor_d_ddout->data<T>();
 
     if (out_tensor_d_x) {
-      auto* data_d_x = out_tensor_d_x->mutable_data<T>();
+      auto* data_d_x = out_tensor_d_x->mutable_data<T>(ctx.GetPlace());
       const auto* data_ddy = in_tensor_ddy->data<T>();
 
       const DDim& dim = out_tensor_d_x->dims();
@@ -567,7 +567,7 @@ struct DotTripleGradFunction<DeviceContext,
     }
 
     if (out_tensor_d_y) {
-      auto* data_d_y = out_tensor_d_y->mutable_data<T>();
+      auto* data_d_y = out_tensor_d_y->mutable_data<T>(ctx.GetPlace());
       const auto* data_ddx = in_tensor_ddx->data<T>();
 
       const DDim& dim = out_tensor_d_y->dims();
@@ -582,7 +582,7 @@ struct DotTripleGradFunction<DeviceContext,
     }
 
     if (out_tensor_d_dout) {
-      auto* data_d_dout = out_tensor_d_dout->mutable_data<T>();
+      auto* data_d_dout = out_tensor_d_dout->mutable_data<T>(ctx.GetPlace());
       auto* data_ddx = in_tensor_ddx->data<T>();
       auto* data_ddy = in_tensor_ddy->data<T>();
       auto* data_d_dx = in_tensor_d_dx->data<T>();
@@ -613,7 +613,7 @@ struct DotTripleGradFunction<DeviceContext,
     }
 
     if (out_tensor_d_ddx) {
-      auto* data_d_ddx = out_tensor_d_ddx->mutable_data<T>();
+      auto* data_d_ddx = out_tensor_d_ddx->mutable_data<T>(ctx.GetPlace());
       auto* data_dout = in_tensor_dout->data<T>();
       auto* data_d_dy = in_tensor_d_dy->data<T>();
       auto* data_y = in_tensor_y->data<T>();
@@ -633,7 +633,7 @@ struct DotTripleGradFunction<DeviceContext,
     }
 
     if (out_tensor_d_ddy) {
-      auto* data_d_ddy = out_tensor_d_ddy->mutable_data<T>();
+      auto* data_d_ddy = out_tensor_d_ddy->mutable_data<T>(ctx.GetPlace());
       auto* data_dout = in_tensor_dout->data<T>();
       auto* data_d_dx = in_tensor_d_dx->data<T>();
       auto* data_x = in_tensor_x->data<T>();
@@ -678,7 +678,7 @@ struct DotTripleGradFunction<DeviceContext,
       auto& dev = *ctx.eigen_device();
       auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
       if (out_tensor_d_x) {
-        out_tensor_d_x->mutable_data<T>();
+        out_tensor_d_x->mutable_data<T>(ctx.GetPlace());
         auto ddy = EigenVector<T>::Flatten(*in_tensor_ddy);
         Eigen::DSizes<int, 1> size(in_tensor_ddy->numel());
         auto d_x = EigenVector<T>::Flatten(*out_tensor_d_x);
@@ -686,7 +686,7 @@ struct DotTripleGradFunction<DeviceContext,
       }
 
       if (out_tensor_d_y) {
-        out_tensor_d_y->mutable_data<T>();
+        out_tensor_d_y->mutable_data<T>(ctx.GetPlace());
         auto ddx = EigenVector<T>::Flatten(*in_tensor_ddx);
         Eigen::DSizes<int, 1> size(in_tensor_ddx->numel());
 
@@ -695,7 +695,7 @@ struct DotTripleGradFunction<DeviceContext,
       }
 
       if (out_tensor_d_dout) {
-        out_tensor_d_dout->mutable_data<T>();
+        out_tensor_d_dout->mutable_data<T>(ctx.GetPlace());
         auto ddx = EigenVector<T>::Flatten(*in_tensor_ddx);
         auto ddy = EigenVector<T>::Flatten(*in_tensor_ddy);
         auto d_dx = EigenVector<T>::Flatten(*in_tensor_d_dx);
@@ -705,7 +705,7 @@ struct DotTripleGradFunction<DeviceContext,
       }
 
       if (out_tensor_d_ddx) {
-        out_tensor_d_ddx->mutable_data<T>();
+        out_tensor_d_ddx->mutable_data<T>(ctx.GetPlace());
         auto dout = EigenVector<T>::Flatten(*in_tensor_dout);
         auto y = EigenVector<T>::Flatten(*in_tensor_y);
         auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
@@ -717,7 +717,7 @@ struct DotTripleGradFunction<DeviceContext,
       }
 
       if (out_tensor_d_ddy) {
-        out_tensor_d_ddy->mutable_data<T>();
+        out_tensor_d_ddy->mutable_data<T>(ctx.GetPlace());
         auto dout = EigenVector<T>::Flatten(*in_tensor_dout);
         auto x = EigenVector<T>::Flatten(*in_tensor_x);
         auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
@@ -732,7 +732,7 @@ struct DotTripleGradFunction<DeviceContext,
     const auto* data_d_ddout = in_tensor_d_ddout->data<T>();
 
     if (out_tensor_d_x) {
-      auto* data_d_x = out_tensor_d_x->mutable_data<T>();
+      auto* data_d_x = out_tensor_d_x->mutable_data<T>(ctx.GetPlace());
       const auto* data_ddy = in_tensor_ddy->data<T>();
 
       const DDim& dim = out_tensor_d_x->dims();
@@ -747,7 +747,7 @@ struct DotTripleGradFunction<DeviceContext,
     }
 
     if (out_tensor_d_y) {
-      auto* data_d_y = out_tensor_d_y->mutable_data<T>();
+      auto* data_d_y = out_tensor_d_y->mutable_data<T>(ctx.GetPlace());
       const auto* data_ddx = in_tensor_ddx->data<T>();
 
       const DDim& dim = out_tensor_d_y->dims();
@@ -762,7 +762,7 @@ struct DotTripleGradFunction<DeviceContext,
     }
 
     if (out_tensor_d_dout) {
-      auto* data_d_dout = out_tensor_d_dout->mutable_data<T>();
+      auto* data_d_dout = out_tensor_d_dout->mutable_data<T>(ctx.GetPlace());
       auto* data_ddx = in_tensor_ddx->data<T>();
       auto* data_ddy = in_tensor_ddy->data<T>();
       auto* data_d_dx = in_tensor_d_dx->data<T>();
@@ -790,7 +790,7 @@ struct DotTripleGradFunction<DeviceContext,
     }
 
     if (out_tensor_d_ddx) {
-      auto* data_d_ddx = out_tensor_d_ddx->mutable_data<T>();
+      auto* data_d_ddx = out_tensor_d_ddx->mutable_data<T>(ctx.GetPlace());
       auto* data_dout = in_tensor_dout->data<T>();
       auto* data_d_dy = in_tensor_d_dy->data<T>();
       auto* data_y = in_tensor_y->data<T>();
@@ -809,7 +809,7 @@ struct DotTripleGradFunction<DeviceContext,
     }
 
     if (out_tensor_d_ddy) {
-      auto* data_d_ddy = out_tensor_d_ddy->mutable_data<T>();
+      auto* data_d_ddy = out_tensor_d_ddy->mutable_data<T>(ctx.GetPlace());
       auto* data_dout = in_tensor_dout->data<T>();
       auto* data_d_dx = in_tensor_d_dx->data<T>();
       auto* data_x = in_tensor_x->data<T>();
@@ -838,10 +838,10 @@ void DotGradKernel(const Context& dev_ctx,
                    DenseTensor* dx,
                    DenseTensor* dy) {
   if (dx) {
-    dx->mutable_data<T>();
+    dx->mutable_data<T>(dev_ctx.GetPlace());
   }
   if (dy) {
-    dy->mutable_data<T>();
+    dy->mutable_data<T>(dev_ctx.GetPlace());
   }
   DotGradFunction<Context, T>()(dev_ctx, &x, &y, &dout, dx, dy);
 }
@@ -857,13 +857,13 @@ void DotDoubleGradKernel(const Context& dev_ctx,
                          DenseTensor* dy,
                          DenseTensor* ddout) {
   if (dx) {
-    dx->mutable_data<T>();
+    dx->mutable_data<T>(dev_ctx.GetPlace());
   }
   if (dy) {
-    dy->mutable_data<T>();
+    dy->mutable_data<T>(dev_ctx.GetPlace());
   }
   if (ddout) {
-    ddout->mutable_data<T>();
+    ddout->mutable_data<T>(dev_ctx.GetPlace());
   }
   DotDoubleGradFunction<Context, T>()(
       dev_ctx, &x, &y, &dout, ddx, ddy, dx, dy, ddout);
@@ -885,19 +885,19 @@ void DotTripleGradKernel(const Context& dev_ctx,
                          DenseTensor* d_ddy,
                          DenseTensor* d_dout) {
   if (d_x) {
-    d_x->mutable_data<T>();
+    d_x->mutable_data<T>(dev_ctx.GetPlace());
   }
   if (d_y) {
-    d_y->mutable_data<T>();
+    d_y->mutable_data<T>(dev_ctx.GetPlace());
   }
   if (d_ddx) {
-    d_ddx->mutable_data<T>();
+    d_ddx->mutable_data<T>(dev_ctx.GetPlace());
   }
   if (d_ddy) {
-    d_ddy->mutable_data<T>();
+    d_ddy->mutable_data<T>(dev_ctx.GetPlace());
   }
   if (d_dout) {
-    d_dout->mutable_data<T>();
+    d_dout->mutable_data<T>(dev_ctx.GetPlace());
   }
 
   DotTripleGradFunction<Context, T>()(dev_ctx,
diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h
index 2900e2e83bd659b8fe5b6fbd1af12ec01524d52e..4fee23e175c9ec28a492a3ff2cf37ba5c1234b92 100644
--- a/paddle/pten/kernels/impl/full_kernel_impl.h
+++ b/paddle/pten/kernels/impl/full_kernel_impl.h
@@ -26,7 +26,7 @@ namespace pten {
 
 template <typename T, typename Context, typename VType>
 void FullValue(const Context& dev_ctx, DenseTensor* tensor, VType val) {
-  tensor->mutable_data<T>();
+  tensor->mutable_data<T>(dev_ctx.GetPlace());
   auto t = pten::EigenVector<T>::Flatten(*tensor);
   t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(val));
 }
diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
index 71fadfae7deb822d5997491e2eaf8b413a8647fc..fbcb073150cc5ecea4252d755c6f85e677bdf120 100644
--- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
+++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
@@ -105,7 +105,7 @@ void MatMul(const Context& dev_ctx,
             bool trans_b,
             DenseTensor* out,
             bool flag = false) {
-  out->mutable_data<T>();
+  out->mutable_data<T>(dev_ctx.GetPlace());
   auto blas = paddle::operators::math::GetBlas<Context, T>(dev_ctx);
   auto mat_dim_a =
       paddle::operators::math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
@@ -123,7 +123,7 @@ void MatMul(const Context& dev_ctx,
               b.data<T>(),
               mat_dim_b,
               static_cast<T>(1),
-              out->mutable_data<T>(),
+              out->data<T>(),
               static_cast<T>(flag));
 }
 
@@ -242,8 +242,8 @@ void MatmulGradKernel(const Context& dev_ctx,
 
   // Case1 : x's or y's dim = 1
   if (x_ndim == 1 && y_ndim == 1) {
-    if (dx) dx->mutable_data<T>();
-    if (dy) dy->mutable_data<T>();
+    if (dx) dx->mutable_data<T>(dev_ctx.GetPlace());
+    if (dy) dy->mutable_data<T>(dev_ctx.GetPlace());
     if (out_grad.numel() == 1) {
       DotGradFunction<Context, T>()(dev_ctx, &x, &y, &out_grad, dx, dy);
       return;
diff --git a/paddle/pten/kernels/impl/matmul_kernel_impl.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h
index afe6bf71e2f6b453031863ff1ac9d67f32f79e65..e59a54c703ab543c5d27db5291b0b2cb9c6ee79b 100644
--- a/paddle/pten/kernels/impl/matmul_kernel_impl.h
+++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h
@@ -118,7 +118,7 @@ void MatMulFunction(const Context& dev_ctx,
             N));
     VLOG(3) << "MatMul's case 1";
     Out->Resize({1});
-    Out->mutable_data<T>();
+    Out->mutable_data<T>(dev_ctx.GetPlace());
     blas.GEMM(CblasNoTrans,
               CblasTrans,
               1,
@@ -128,7 +128,7 @@ void MatMulFunction(const Context& dev_ctx,
               y_data,
               x_data,
               static_cast<T>(flag),
-              Out->mutable_data<T>());
+              Out->data<T>());
     return;
   }
 
@@ -165,7 +165,7 @@ void MatMulFunction(const Context& dev_ctx,
       out_dims.back() = y_dims.back();
     }
     Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims));
-    Out->mutable_data<T>();
+    Out->mutable_data<T>(dev_ctx.GetPlace());
     if (trans_y) {
       const int M = Y.numel() / N;
       VLOG(3) << "MatMul's case 2";
@@ -176,7 +176,7 @@ void MatMulFunction(const Context& dev_ctx,
                 y_data,
                 x_data,
                 static_cast<T>(flag),
-                Out->mutable_data<T>());
+                Out->data<T>());
     } else {
       const int M = y_dims[y_ndim - 1];
       const int batch_size = Y.numel() / (M * N);
@@ -189,7 +189,7 @@ void MatMulFunction(const Context& dev_ctx,
                   y_data,
                   x_data,
                   static_cast<T>(flag),
-                  Out->mutable_data<T>());
+                  Out->data<T>());
       } else {
         VLOG(3) << "MatMul's case 4";
         blas.BatchedGEMM(CblasTrans,
@@ -201,7 +201,7 @@ void MatMulFunction(const Context& dev_ctx,
                          y_data,
                          x_data,
                          static_cast<T>(flag),
-                         Out->mutable_data<T>(),
+                         Out->data<T>(),
                          batch_size,
                          M * N,
                          0);
@@ -243,7 +243,7 @@ void MatMulFunction(const Context& dev_ctx,
       std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin());
     }
     Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims));
-    Out->mutable_data<T>();
+    Out->mutable_data<T>(dev_ctx.GetPlace());
 
     if (trans_x) {
       const int M = x_dims[x_ndim - 1];
@@ -257,7 +257,7 @@ void MatMulFunction(const Context& dev_ctx,
                   x_data,
                   y_data,
                   static_cast<T>(flag),
-                  Out->mutable_data<T>());
+                  Out->data<T>());
       } else {
         VLOG(3) << "MatMul's case 6";
         blas.BatchedGEMM(CblasTrans,
@@ -269,7 +269,7 @@ void MatMulFunction(const Context& dev_ctx,
                          x_data,
                          y_data,
                          static_cast<T>(flag),
-                         Out->mutable_data<T>(),
+                         Out->data<T>(),
                          batch_size,
                          M * N,
                          0);
@@ -284,7 +284,7 @@ void MatMulFunction(const Context& dev_ctx,
                 x_data,
                 y_data,
                 static_cast<T>(flag),
-                Out->mutable_data<T>());
+                Out->data<T>());
     }
     return;
   }
@@ -331,7 +331,7 @@ void MatMulFunction(const Context& dev_ctx,
   out_broadcast_dims[ndim - 1] = N;
 
   Out->ResizeAndAllocate(pten::framework::make_ddim(out_broadcast_dims));
-  Out->mutable_data<T>();
+  Out->mutable_data<T>(dev_ctx.GetPlace());
 
   const int batch_dim = ndim - 2;
   // broadcast message
@@ -367,7 +367,7 @@ void MatMulFunction(const Context& dev_ctx,
               x_data,
               y_data,
               static_cast<T>(flag),
-              Out->mutable_data<T>());
+              Out->data<T>());
   } else if (x_batch_size == 1) {
     if (M == 1 && trans_y) {
       VLOG(3) << "MatMul's case 9";
@@ -378,7 +378,7 @@ void MatMulFunction(const Context& dev_ctx,
                 y_data,
                 x_data,
                 static_cast<T>(flag),
-                Out->mutable_data<T>());
+                Out->data<T>());
     } else {
       VLOG(3) << "MatMul's case 10";
       blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
@@ -390,7 +390,7 @@ void MatMulFunction(const Context& dev_ctx,
                        x_data,
                        y_data,
                        static_cast<T>(flag),
-                       Out->mutable_data<T>(),
+                       Out->data<T>(),
                        out_batch_size,
                        0,
                        K * N);
@@ -407,7 +407,7 @@ void MatMulFunction(const Context& dev_ctx,
                 x_data,
                 y_data,
                 static_cast<T>(flag),
-                Out->mutable_data<T>());
+                Out->data<T>());
     } else {
       VLOG(3) << "MatMul's case 12";
       blas.BatchedGEMM(CblasTrans,
@@ -419,7 +419,7 @@ void MatMulFunction(const Context& dev_ctx,
                        x_data,
                        y_data,
                        static_cast<T>(flag),
-                       Out->mutable_data<T>(),
+                       Out->data<T>(),
                        out_batch_size,
                        M * K,
                        0);
@@ -435,7 +435,7 @@ void MatMulFunction(const Context& dev_ctx,
                      x_data,
                      y_data,
                      static_cast<T>(flag),
-                     Out->mutable_data<T>(),
+                     Out->data<T>(),
                      out_batch_size,
                      M * K,
                      K * N);
@@ -454,7 +454,7 @@ void MatMulFunction(const Context& dev_ctx,
 
       x_ptr[i] = x_data + x_index * M * K;
       y_ptr[i] = y_data + y_index * K * N;
-      out_ptr[i] = Out->mutable_data<T>() + i * M * N;
+      out_ptr[i] = Out->data<T>() + i * M * N;
       IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data());
     }
     VLOG(3) << "MatMul's case 14";
diff --git a/paddle/pten/kernels/impl/sign_kernel_impl.h b/paddle/pten/kernels/impl/sign_kernel_impl.h
index 655cda762ee1a03c451981d3890cd3176390004f..54c1464c9e0221d5cc17c0db29fd7c2ce5ebf0f1 100644
--- a/paddle/pten/kernels/impl/sign_kernel_impl.h
+++ b/paddle/pten/kernels/impl/sign_kernel_impl.h
@@ -26,7 +26,7 @@ template <typename T, typename Context>
 void SignKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 DenseTensor* out) {
-  out->mutable_data<T>();
+  out->mutable_data<T>(dev_ctx.GetPlace());
   auto eigen_out = pten::EigenVector<T>::Flatten(*out);
   auto eigen_x = pten::EigenVector<T>::Flatten(x);
 
diff --git a/paddle/pten/kernels/reshape_kernel.cc b/paddle/pten/kernels/reshape_kernel.cc
index 7f58bbbd3732d0f66ad47a2d2f011e20b3d9a55a..9bfad22374c9f0c840634a16bfff45849e8ef60a 100644
--- a/paddle/pten/kernels/reshape_kernel.cc
+++ b/paddle/pten/kernels/reshape_kernel.cc
@@ -27,12 +27,15 @@ void ReshapeKernel(const Context& dev_ctx,
                    const ScalarArray& shape,
                    DenseTensor* out) {
   auto out_meta = InferMetaFromVecValue(x.meta(), shape.GetData());
-  if (x.data() == out->data() && x.numel() == out->numel()) {
+  if (x.initialized() && x.Holder() == out->Holder()) {
     out->ResizeAndAllocate(out_meta.dims);
     return;
   }
+
+  out->Resize(x.dims());
+  out->mutable_data(x.place());
   pten::Copy(dev_ctx, x, false, out);
-  out->ResizeAndAllocate(out_meta.dims);
+  out->Resize(out_meta.dims);
   out->ResetLoD(x.lod());
 }
 
diff --git a/paddle/pten/kernels/xpu/copy_kernel.cc b/paddle/pten/kernels/xpu/copy_kernel.cc
index 3287fa1f7a8572cd1fa12e66b369d907b988c4f6..56b79061f75f680cfc82b54d18733769b50b07b3 100644
--- a/paddle/pten/kernels/xpu/copy_kernel.cc
+++ b/paddle/pten/kernels/xpu/copy_kernel.cc
@@ -30,7 +30,7 @@ void Copy(const Context& dev_ctx,
           bool blocking,
           DenseTensor* dst) {
   auto* src_ptr = src.data();
-  auto* dst_ptr = dst->mutable_data();
+  auto* dst_ptr = dst->mutable_data(dev_ctx.GetPlace());
   const auto& src_place = src.place();
   const auto& dst_place = dst->place();
 
diff --git a/paddle/pten/tests/api/test_cast_api.cc b/paddle/pten/tests/api/test_cast_api.cc
index 0a3b56e3f18d4dfccfdf5f56e5b690fffcd33ddc..b87bebacab7d98f70255df407b2494a1e431f708 100644
--- a/paddle/pten/tests/api/test_cast_api.cc
+++ b/paddle/pten/tests/api/test_cast_api.cc
@@ -37,7 +37,8 @@ TEST(API, cast) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 4}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   for (int i = 0; i < dense_x->numel(); i++) {
     dense_x_data[i] = i;
diff --git a/paddle/pten/tests/api/test_conj_api.cc b/paddle/pten/tests/api/test_conj_api.cc
index c17b0f23f4f6b7751009b69b7a504570c1f70a9d..0273737347ec380bb5517e0fb0097ef6a5e1e142 100644
--- a/paddle/pten/tests/api/test_conj_api.cc
+++ b/paddle/pten/tests/api/test_conj_api.cc
@@ -37,7 +37,8 @@ TEST(API, conj) {
       pten::DenseTensorMeta(pten::DataType::COMPLEX64,
                             framework::make_ddim({3, 10}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<paddle::complex64>();
+  auto* dense_x_data =
+      dense_x->mutable_data<paddle::complex64>(paddle::platform::CPUPlace());
 
   for (size_t i = 0; i < 3; ++i) {
     for (size_t j = 0; j < 10; ++j) {
diff --git a/paddle/pten/tests/api/test_dot_api.cc b/paddle/pten/tests/api/test_dot_api.cc
index 97616d0cbcd57750c5e8ee3464fb167313713fc8..6de8943a467666123a9235d5113e43e4e7fffcea 100644
--- a/paddle/pten/tests/api/test_dot_api.cc
+++ b/paddle/pten/tests/api/test_dot_api.cc
@@ -37,14 +37,16 @@ TEST(API, dot) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 10}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   auto dense_y = std::make_shared<pten::DenseTensor>(
       alloc.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 10}),
                             pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y->mutable_data<float>();
+  auto* dense_y_data =
+      dense_y->mutable_data<float>(paddle::platform::CPUPlace());
 
   float sum[3] = {0.0, 0.0, 0.0};
   for (size_t i = 0; i < 3; ++i) {
diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc
index 17a6ffde9df0abf5be8333b21b2cef432ce89b1f..df1c6278d96fdefa97969723cba785814f8636d3 100644
--- a/paddle/pten/tests/api/test_elementwise_api.cc
+++ b/paddle/pten/tests/api/test_elementwise_api.cc
@@ -37,14 +37,16 @@ TEST(API, add) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 10}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   auto dense_y = std::make_shared<pten::DenseTensor>(
       alloc.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({10}),
                             pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y->mutable_data<float>();
+  auto* dense_y_data =
+      dense_y->mutable_data<float>(paddle::platform::CPUPlace());
 
   float sum[3][10] = {0.0};
   for (size_t i = 0; i < 3; ++i) {
@@ -91,14 +93,16 @@ TEST(API, subtract) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 10}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   auto dense_y = std::make_shared<pten::DenseTensor>(
       alloc.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({10}),
                             pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y->mutable_data<float>();
+  auto* dense_y_data =
+      dense_y->mutable_data<float>(paddle::platform::CPUPlace());
 
   float sub[3][10] = {0.0};
   for (size_t i = 0; i < 3; ++i) {
@@ -145,14 +149,16 @@ TEST(API, divide) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 10}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   auto dense_y = std::make_shared<pten::DenseTensor>(
       alloc.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({10}),
                             pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y->mutable_data<float>();
+  auto* dense_y_data =
+      dense_y->mutable_data<float>(paddle::platform::CPUPlace());
 
   float div[3][10] = {0.0};
   for (size_t i = 0; i < 3; ++i) {
@@ -199,14 +205,16 @@ TEST(API, multiply) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 10}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   auto dense_y = std::make_shared<pten::DenseTensor>(
       alloc.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({10}),
                             pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y->mutable_data<float>();
+  auto* dense_y_data =
+      dense_y->mutable_data<float>(paddle::platform::CPUPlace());
 
   float mul[3][10] = {0.0};
   for (size_t i = 0; i < 3; ++i) {
diff --git a/paddle/pten/tests/api/test_empty_api.cc b/paddle/pten/tests/api/test_empty_api.cc
index f38e91b02b7051800820d9547a4cf68ade5cc67d..72f9100f7b3b7166937e59826985bd99d08b9ecb 100644
--- a/paddle/pten/tests/api/test_empty_api.cc
+++ b/paddle/pten/tests/api/test_empty_api.cc
@@ -47,10 +47,8 @@ TEST(API, empty_like) {
   ASSERT_EQ(out.dims().size(), 2);
   ASSERT_EQ(out.dims()[0], 3);
   ASSERT_EQ(out.numel(), 6);
-  ASSERT_EQ(out.is_cpu(), true);
   ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
   ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
-  ASSERT_EQ(out.initialized(), true);
 }
 
 TEST(API, empty1) {
@@ -63,7 +61,8 @@ TEST(API, empty1) {
       pten::DenseTensorMeta(pten::DataType::INT64,
                             framework::make_ddim({2}),
                             pten::DataLayout::NCHW));
-  auto* shape_data = dense_shape->mutable_data<int64_t>();
+  auto* shape_data =
+      dense_shape->mutable_data<int64_t>(paddle::platform::CPUPlace());
   shape_data[0] = 2;
   shape_data[1] = 3;
 
@@ -76,10 +75,8 @@ TEST(API, empty1) {
   ASSERT_EQ(out.shape().size(), 2UL);
   ASSERT_EQ(out.shape()[0], 2);
   ASSERT_EQ(out.numel(), 6);
-  ASSERT_EQ(out.is_cpu(), true);
   ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
   ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
-  ASSERT_EQ(out.initialized(), true);
 }
 
 TEST(API, empty2) {
@@ -91,7 +88,7 @@ TEST(API, empty2) {
       pten::DenseTensorMeta(pten::DataType::INT32,
                             framework::make_ddim({1}),
                             pten::DataLayout::NCHW));
-  dense_scalar->mutable_data<int32_t>()[0] = 2;
+  dense_scalar->mutable_data<int32_t>(paddle::platform::CPUPlace())[0] = 2;
 
   paddle::experimental::Tensor shape_scalar1(dense_scalar);
   paddle::experimental::Tensor shape_scalar2(dense_scalar);
@@ -103,10 +100,8 @@ TEST(API, empty2) {
   ASSERT_EQ(out.shape().size(), 2UL);
   ASSERT_EQ(out.shape()[0], 2);
   ASSERT_EQ(out.numel(), 4);
-  ASSERT_EQ(out.is_cpu(), true);
   ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
   ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
-  ASSERT_EQ(out.initialized(), true);
 }
 
 TEST(API, empty3) {
@@ -117,10 +112,8 @@ TEST(API, empty3) {
   ASSERT_EQ(out.shape().size(), 2UL);
   ASSERT_EQ(out.shape()[0], 2);
   ASSERT_EQ(out.numel(), 6);
-  ASSERT_EQ(out.is_cpu(), true);
   ASSERT_EQ(out.type(), pten::DataType::INT32);
   ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
-  ASSERT_EQ(out.initialized(), true);
 }
 
 }  // namespace tests
diff --git a/paddle/pten/tests/api/test_fill_api.cc b/paddle/pten/tests/api/test_fill_api.cc
index 7910cc840f5efdee10406d81bcbda1385e4eb39c..4b78d142aefb2aa60025f10a4a29e02182cac830 100644
--- a/paddle/pten/tests/api/test_fill_api.cc
+++ b/paddle/pten/tests/api/test_fill_api.cc
@@ -37,7 +37,8 @@ TEST(API, full_like) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 2}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
   dense_x_data[0] = 0;
 
   float val = 1.0;
@@ -72,7 +73,8 @@ TEST(API, zeros_like) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 2}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
   dense_x_data[0] = 1;
 
   paddle::experimental::Tensor x(dense_x);
@@ -105,7 +107,8 @@ TEST(API, ones_like) {
       pten::DenseTensorMeta(pten::DataType::INT32,
                             framework::make_ddim({3, 2}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<int32_t>();
+  auto* dense_x_data =
+      dense_x->mutable_data<int32_t>(paddle::platform::CPUPlace());
   dense_x_data[0] = 0;
 
   paddle::experimental::Tensor x(dense_x);
@@ -139,7 +142,8 @@ TEST(API, full1) {
       pten::DenseTensorMeta(pten::DataType::INT64,
                             framework::make_ddim({2}),
                             pten::DataLayout::NCHW));
-  auto* shape_data = dense_shape->mutable_data<int64_t>();
+  auto* shape_data =
+      dense_shape->mutable_data<int64_t>(paddle::platform::CPUPlace());
   shape_data[0] = 2;
   shape_data[1] = 3;
 
@@ -148,7 +152,7 @@ TEST(API, full1) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({1}),
                             pten::DataLayout::NCHW));
-  dense_scalar->mutable_data<float>()[0] = 1.0;
+  dense_scalar->mutable_data<float>(paddle::platform::CPUPlace())[0] = 1.0;
 
   paddle::experimental::Tensor value(dense_scalar);
 
@@ -185,7 +189,7 @@ TEST(API, full2) {
       pten::DenseTensorMeta(pten::DataType::INT32,
                             framework::make_ddim({1}),
                             pten::DataLayout::NCHW));
-  dense_scalar->mutable_data<int32_t>()[0] = 2;
+  dense_scalar->mutable_data<int32_t>(paddle::platform::CPUPlace())[0] = 2;
 
   paddle::experimental::Tensor shape_scalar1(dense_scalar);
   paddle::experimental::Tensor shape_scalar2(dense_scalar);
diff --git a/paddle/pten/tests/api/test_flatten_api.cc b/paddle/pten/tests/api/test_flatten_api.cc
index cf8fa9cb1895fb9d7f04059ae979050a280a40fa..f3b80f7db571553776583379159eeaf57f3cda72 100644
--- a/paddle/pten/tests/api/test_flatten_api.cc
+++ b/paddle/pten/tests/api/test_flatten_api.cc
@@ -37,7 +37,8 @@ TEST(API, flatten) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 2, 2, 3}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   for (int i = 0; i < dense_x->numel(); i++) {
     dense_x_data[i] = i;
diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc
index 08e0e888b99edd86dd4b05c7998c90596e646881..7342916c514ec30620db6928e2a2855f0ff3b61c 100644
--- a/paddle/pten/tests/api/test_matmul_api.cc
+++ b/paddle/pten/tests/api/test_matmul_api.cc
@@ -38,14 +38,16 @@ TEST(API, matmul_cpu) {
                             framework::make_ddim({3, 3}),
                             pten::DataLayout::NCHW));
 
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   auto dense_y = std::make_shared<pten::DenseTensor>(
       alloc.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 3}),
                             pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y->mutable_data<float>();
+  auto* dense_y_data =
+      dense_y->mutable_data<float>(paddle::platform::CPUPlace());
 
   for (size_t i = 0; i < 9; ++i) {
     dense_x_data[i] = 1.0;
@@ -87,14 +89,14 @@ TEST(API, matmul_cuda) {
                             framework::make_ddim({3, 3}),
                             pten::DataLayout::NCHW));
 
-  auto* ref_x_data = ref_x->mutable_data<float>();
+  auto* ref_x_data = ref_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   auto ref_y = std::make_shared<pten::DenseTensor>(
       alloc_cpu.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 3}),
                             pten::DataLayout::NCHW));
-  auto* ref_y_data = ref_y->mutable_data<float>();
+  auto* ref_y_data = ref_y->mutable_data<float>(paddle::platform::CPUPlace());
 
   for (size_t i = 0; i < 9; ++i) {
     ref_x_data[i] = 1.0;
diff --git a/paddle/pten/tests/api/test_mean_api.cc b/paddle/pten/tests/api/test_mean_api.cc
index a7b85cff12cc1cfb0070a7527d653cf42807dbe5..046db05ca2bbb79abd636014f7d8f50deeb3a099 100644
--- a/paddle/pten/tests/api/test_mean_api.cc
+++ b/paddle/pten/tests/api/test_mean_api.cc
@@ -37,7 +37,8 @@ TEST(API, mean) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 4}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   float sum = 0.0;
   for (size_t i = 0; i < 12; ++i) {
diff --git a/paddle/pten/tests/api/test_pten_tensor.cc b/paddle/pten/tests/api/test_pten_tensor.cc
index a28f7ca2ca2e685ffebfd9ceb9906e245fb80fce..e6e2730a94c5500341636e5100b148b7947986ed 100644
--- a/paddle/pten/tests/api/test_pten_tensor.cc
+++ b/paddle/pten/tests/api/test_pten_tensor.cc
@@ -58,11 +58,11 @@ void TestAPIPlace() {
   std::vector<int64_t> tensor_shape = {5, 5};
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto t1 = experimental::Tensor(paddle::PlaceType::kGPU, tensor_shape);
-  t1.mutable_data<float>();
+  t1.mutable_data<float>(paddle::PlaceType::kGPU);
   CHECK((paddle::PlaceType::kGPU == t1.place()));
 #endif
   auto t2 = experimental::Tensor(paddle::PlaceType::kCPU, tensor_shape);
-  t2.mutable_data<float>();
+  t2.mutable_data<float>(paddle::PlaceType::kCPU);
   CHECK((paddle::PlaceType::kCPU == t2.place()));
 }
 
@@ -80,29 +80,30 @@ void TestAPISlice() {
   std::vector<int64_t> tensor_shape_sub2 = {1, 5, 5};
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto t1 = experimental::Tensor(paddle::PlaceType::kGPU, tensor_shape_origin1);
-  t1.mutable_data<float>();
+  t1.mutable_data<float>(paddle::PlaceType::kGPU);
   CHECK(t1.slice(0, 5).shape() == tensor_shape_origin1);
   CHECK(t1.slice(0, 3).shape() == tensor_shape_sub1);
   auto t2 = experimental::Tensor(paddle::PlaceType::kGPU, tensor_shape_origin2);
-  t2.mutable_data<float>();
+  t2.mutable_data<float>(paddle::PlaceType::kGPU);
   CHECK(t2.slice(4, 5).shape() == tensor_shape_sub2);
 #endif
   auto t3 = experimental::Tensor(paddle::PlaceType::kCPU, tensor_shape_origin1);
-  t3.mutable_data<float>();
+  t3.mutable_data<float>(paddle::PlaceType::kCPU);
   CHECK(t3.slice(0, 5).shape() == tensor_shape_origin1);
   CHECK(t3.slice(0, 3).shape() == tensor_shape_sub1);
   auto t4 = experimental::Tensor(paddle::PlaceType::kCPU, tensor_shape_origin2);
-  t4.mutable_data<float>();
+  t4.mutable_data<float>(paddle::PlaceType::kCPU);
   CHECK(t4.slice(4, 5).shape() == tensor_shape_sub2);
 
   // Test writing function for sliced tensor
   auto t = InitCPUTensorForTest<float>();
   auto t_sliced = t.slice(0, 1);
-  auto* t_sliced_data_ptr = t_sliced.mutable_data<float>();
+  auto* t_sliced_data_ptr =
+      t_sliced.mutable_data<float>(paddle::PlaceType::kCPU);
   for (int64_t i = 0; i < t_sliced.size(); i++) {
     t_sliced_data_ptr[i] += static_cast<float>(5);
   }
-  auto* t_data_ptr = t.mutable_data<float>();
+  auto* t_data_ptr = t.mutable_data<float>(paddle::PlaceType::kCPU);
   for (int64_t i = 0; i < t_sliced.size(); i++) {
     CHECK_EQ(t_data_ptr[i], static_cast<float>(10));
   }
@@ -112,7 +113,7 @@ template <typename T>
 paddle::DataType TestDtype() {
   std::vector<int64_t> tensor_shape = {5, 5};
   auto t1 = experimental::Tensor(paddle::PlaceType::kCPU, tensor_shape);
-  t1.template mutable_data<T>();
+  t1.template mutable_data<T>(paddle::PlaceType::kCPU);
   return t1.type();
 }
 
@@ -120,13 +121,13 @@ template <typename T>
 void TestCast(paddle::DataType data_type) {
   std::vector<int64_t> tensor_shape = {5, 5};
   auto t1 = experimental::Tensor(paddle::PlaceType::kCPU, tensor_shape);
-  t1.template mutable_data<T>();
+  t1.template mutable_data<T>(paddle::PlaceType::kCPU);
   auto t2 = t1.cast(data_type);
   CHECK(t2.type() == data_type);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto tg1 = experimental::Tensor(paddle::PlaceType::kGPU);
   tg1.reshape(tensor_shape);
-  tg1.template mutable_data<T>();
+  tg1.template mutable_data<T>(paddle::PlaceType::kGPU);
   auto tg2 = tg1.cast(data_type);
   CHECK(tg2.type() == data_type);
 #endif
@@ -194,7 +195,7 @@ void GroupTestDtype() {
 void TestInitilized() {
   experimental::Tensor test_tensor(paddle::PlaceType::kCPU, {1, 1});
   CHECK(test_tensor.is_initialized() == false);
-  test_tensor.mutable_data<float>();
+  test_tensor.mutable_data<float>(paddle::PlaceType::kCPU);
   CHECK(test_tensor.is_initialized() == true);
   float* tensor_data = test_tensor.mutable_data<float>();
   for (int i = 0; i < test_tensor.size(); i++) {
diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc
index bfd1ea841443f2940d4cbf4e8a0cb2ead2decbd8..1f0d734a7ec6e9095a5efecdbe521c54eccc367e 100644
--- a/paddle/pten/tests/api/test_reshape_api.cc
+++ b/paddle/pten/tests/api/test_reshape_api.cc
@@ -37,7 +37,8 @@ TEST(API, reshape) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 2, 2, 3}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   for (int i = 0; i < dense_x->numel(); i++) {
     dense_x_data[i] = i;
@@ -69,14 +70,15 @@ TEST(API, reshape) {
 TEST(Tensor, old_reshape) {
   paddle::experimental::Tensor x(paddle::PlaceType::kCPU);
   x.reshape({3, 4});
+  x.mutable_data<float>(paddle::PlaceType::kCPU);
 
   ASSERT_EQ(x.shape()[0], 3);
   ASSERT_EQ(x.shape()[1], 4);
   ASSERT_EQ(x.numel(), 12);
   ASSERT_EQ(x.is_cpu(), true);
-  ASSERT_EQ(x.type(), pten::DataType::UNDEFINED);
+  ASSERT_EQ(x.type(), pten::DataType::FLOAT32);
   ASSERT_EQ(x.layout(), pten::DataLayout::NCHW);
-  ASSERT_EQ(x.initialized(), false);
+  ASSERT_EQ(x.initialized(), true);
 }
 
 }  // namespace tests
diff --git a/paddle/pten/tests/api/test_sum_api.cc b/paddle/pten/tests/api/test_sum_api.cc
index c0d5a89eeb7447d84516988ec8f34422c162267b..385d18aa784440db5ea5df4a248b7a175ee1921d 100644
--- a/paddle/pten/tests/api/test_sum_api.cc
+++ b/paddle/pten/tests/api/test_sum_api.cc
@@ -37,7 +37,8 @@ TEST(API, sum) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 4}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   float sum = 0.0;
   for (size_t i = 0; i < 12; ++i) {
diff --git a/paddle/pten/tests/api/test_to_api.cc b/paddle/pten/tests/api/test_to_api.cc
index fa999aace66784ac117a183335f45ac7585cbfbb..11636e1c0147c18a5c8d5ff00e43e6b0aa89a2cb 100644
--- a/paddle/pten/tests/api/test_to_api.cc
+++ b/paddle/pten/tests/api/test_to_api.cc
@@ -35,7 +35,8 @@ paddle::experimental::Tensor CreateInputTensor() {
       pten::DenseTensorMeta(pten::DataType::INT64,
                             framework::make_ddim({3, 4}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<int64_t>();
+  auto* dense_x_data =
+      dense_x->mutable_data<int64_t>(paddle::platform::CPUPlace());
 
   for (int64_t i = 0; i < 12; ++i) {
     dense_x_data[i] = i;
diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt
index 363a57f036b9bb39fef9007e80073f83f3045bea..43e1480e2c41e3e0a5cc2a57597a83d306e709ed 100644
--- a/paddle/pten/tests/core/CMakeLists.txt
+++ b/paddle/pten/tests/core/CMakeLists.txt
@@ -4,6 +4,14 @@ cc_test(test_type_info SRCS test_type_info.cc)
 cc_test(test_convert_utils SRCS test_convert_utils.cc DEPS convert_utils)
 cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory scale_kernel)
 cc_test(test_pten_device_context SRCS test_device_context.cc DEPS pten_context cpu_context)
+
+cc_test(test_ddim SRCS test_ddim.cc DEPS ddim)
+if(WITH_GPU)
+  nv_test(test_dim SRCS test_dim.cu DEPS ddim)
+elseif(WITH_ROCM)
+  hip_test(test_dim SRCS test_dim.cu DEPS ddim)
+endif()
+
 cc_test(selected_rows_test SRCS test_selected_rows.cc DEPS selected_rows)
 if(WITH_TESTING AND TEST selected_rows_test)
   set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
diff --git a/paddle/pten/core/ddim_test.cc b/paddle/pten/tests/core/test_ddim.cc
similarity index 96%
rename from paddle/pten/core/ddim_test.cc
rename to paddle/pten/tests/core/test_ddim.cc
index 1903bbfdff135ebd19d0cef258401f13fe11fa04..b13fe68ef57afd5d67bed60d901f7fdc084aa1aa 100644
--- a/paddle/pten/core/ddim_test.cc
+++ b/paddle/pten/tests/core/test_ddim.cc
@@ -17,6 +17,9 @@
 #include "gtest/gtest.h"
 #include "paddle/pten/core/ddim.h"
 
+namespace pten {
+namespace tests {
+
 TEST(DDim, Equality) {
   // construct a DDim from an initialization list
   pten::framework::DDim ddim = pten::framework::make_ddim({9, 1, 5});
@@ -81,3 +84,6 @@ TEST(DDim, Print) {
   ss << ddim;
   EXPECT_EQ("2, 3, 4", ss.str());
 }
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc
index 56722d35f325ec3180d440d531e7bbdc699078ce..ff337aa5e8da453c4f222a9cfc8c87109a643295 100644
--- a/paddle/pten/tests/core/test_dense_tensor.cc
+++ b/paddle/pten/tests/core/test_dense_tensor.cc
@@ -112,8 +112,6 @@ TEST(dense_tensor, resize) {
   CHECK_EQ(tensor_0.capacity(), 2u);
   tensor_0.ResizeAndAllocate({1, 2, 3});
   CHECK_EQ(tensor_0.capacity(), 6u);
-  tensor_0.mutable_data<int8_t>();
-  CHECK_EQ(tensor_0.capacity(), 6u);
 }
 
 TEST(dense_tensor, shallow_copy) {
diff --git a/paddle/pten/core/dim_test.cu b/paddle/pten/tests/core/test_dim.cu
similarity index 96%
rename from paddle/pten/core/dim_test.cu
rename to paddle/pten/tests/core/test_dim.cu
index 0f8d71c5d3b4cfb33b59e82709ac8bac51d18e6f..1c4a9c163f9434d06bc01e8976728651c156e233 100644
--- a/paddle/pten/core/dim_test.cu
+++ b/paddle/pten/tests/core/test_dim.cu
@@ -18,6 +18,9 @@
 #include "gtest/gtest.h"
 #include "paddle/pten/core/dim.h"
 
+namespace pten {
+namespace tests {
+
 __global__ void test(pten::framework::Dim<2>* o) {
   o[0] = pten::framework::make_dim(5, 6);
 }
@@ -94,4 +97,7 @@ TEST(Dim, Print) {
     ss << pten::framework::make_dim(8);
     EXPECT_EQ(ss.str(), "8");
   }
-}
\ No newline at end of file
+}
+
+}  // namespace tests
+}  // namespace pten
\ No newline at end of file
diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc
index 3b1412a8e5f4e368e0776a7dbca1f6a47eac8ec3..c9d376b81a630c86a976f991d4edf693312f72ba 100644
--- a/paddle/pten/tests/kernels/test_cast_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc
@@ -38,7 +38,8 @@ TEST(DEV_API, cast) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 4}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   float sum = 0.0;
   for (size_t i = 0; i < 12; ++i) {
diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc
index 51066d8ae478397378bf5986edf2c0704ae7e005..6714b57105bd24ca292e184d1ff90cf7d82e1b92 100644
--- a/paddle/pten/tests/kernels/test_conj_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc
@@ -37,7 +37,8 @@ TEST(DEV_API, conj) {
                                                   framework::make_ddim({3, 4}),
                                                   pten::DataLayout::NCHW));
 
-  auto* dense_x_data = dense_x.mutable_data<paddle::complex64>();
+  auto* dense_x_data =
+      dense_x.mutable_data<paddle::complex64>(paddle::platform::CPUPlace());
   for (size_t i = 0; i < 12; ++i) {
     dense_x_data[i] = paddle::complex64(i * 1.0, i * 1.0);
   }
diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc
index 4f8bd727716cef9641d33cc8603ac81631bc24e4..01dfa925d6c5a1a59f0ae28cd1b28127221ec950 100644
--- a/paddle/pten/tests/kernels/test_copy_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc
@@ -39,7 +39,8 @@ TEST(DEV_API, copy) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({2, 3}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_src->mutable_data<float>();
+  auto* dense_x_data =
+      dense_src->mutable_data<float>(paddle::platform::CPUPlace());
 
   auto dense_dst = std::make_shared<pten::DenseTensor>(
       alloc.get(),
diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc
index 1aa21b847fac4500c23b67f1fe9adb20331bc382..17416d33473d07b227cad38f74bce9c47dd8d520 100644
--- a/paddle/pten/tests/kernels/test_creation_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc
@@ -52,7 +52,8 @@ TEST(DEV_API, empty_like) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 2}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
   dense_x_data[0] = 0;
 
   // 2. test API
@@ -96,7 +97,8 @@ TEST(DEV_API, full_like) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 2}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
   dense_x_data[0] = 0;
   float val = 1.0;
 
diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc
index e4978d84c835cfaab55ad2b9b354d79872cccd79..27fecd3fcd9e38cdf48d96cc83f5d26705adc906 100644
--- a/paddle/pten/tests/kernels/test_dot_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc
@@ -36,13 +36,15 @@ TEST(DEV_API, dot) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   pten::DenseTensor dense_y(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y.mutable_data<float>();
+  auto* dense_y_data =
+      dense_y.mutable_data<float>(paddle::platform::CPUPlace());
 
   float sum[3] = {0.0, 0.0, 0.0};
   for (size_t i = 0; i < 3; ++i) {
diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
index e5d9b05eec7b3ded758e4011c8f37f8d83e403fb..b3948843ee86c56987233bd5238edc3611a0fe9e 100644
--- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
@@ -36,13 +36,15 @@ TEST(DEV_API, add) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   pten::DenseTensor dense_y(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y.mutable_data<float>();
+  auto* dense_y_data =
+      dense_y.mutable_data<float>(paddle::platform::CPUPlace());
 
   float sum[3][10] = {0.0};
   for (size_t i = 0; i < 3; ++i) {
@@ -82,13 +84,15 @@ TEST(DEV_API, subtract) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   pten::DenseTensor dense_y(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y.mutable_data<float>();
+  auto* dense_y_data =
+      dense_y.mutable_data<float>(paddle::platform::CPUPlace());
 
   float sub[3][10] = {0.0};
   for (size_t i = 0; i < 3; ++i) {
@@ -128,13 +132,15 @@ TEST(DEV_API, divide) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   pten::DenseTensor dense_y(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y.mutable_data<float>();
+  auto* dense_y_data =
+      dense_y.mutable_data<float>(paddle::platform::CPUPlace());
 
   float div[3][10] = {0.0};
   for (size_t i = 0; i < 3; ++i) {
@@ -174,13 +180,15 @@ TEST(DEV_API, multiply) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   pten::DenseTensor dense_y(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y.mutable_data<float>();
+  auto* dense_y_data =
+      dense_y.mutable_data<float>(paddle::platform::CPUPlace());
 
   float mul[3][10] = {0.0};
   for (size_t i = 0; i < 3; ++i) {
diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
index 78cd6261c3a41df1edbd9b8d8cc723f4fadcf0c0..fc463d1ff1e1cdaeee1641a2c88f621b0a12c4de 100644
--- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
@@ -47,7 +47,8 @@ TEST(DEV_API, flatten) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 2, 2, 3}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   for (int i = 0; i < dense_x.numel(); i++) {
     dense_x_data[i] = i;
diff --git a/paddle/pten/tests/kernels/test_matmul_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc
index 76f775031921097d74d5ced0c34db8c45290d701..40419ecb3ad936d78eef9bfd7b0c6d0aff93d64c 100644
--- a/paddle/pten/tests/kernels/test_matmul_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc
@@ -36,13 +36,15 @@ TEST(DEV_API, dot) {
                                             framework::make_ddim({3, 3}),
                                             pten::DataLayout::NCHW));
 
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   DenseTensor dense_y(alloc.get(),
                       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                             framework::make_ddim({3, 3}),
                                             pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y.mutable_data<float>();
+  auto* dense_y_data =
+      dense_y.mutable_data<float>(paddle::platform::CPUPlace());
 
   for (size_t i = 0; i < 9; ++i) {
     dense_x_data[i] = 1.0;
diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc
index 07ec30afad5ca92a62a95c37e6afd4bb9639dc04..786492d3a1b1bdf462fa82f76d919cbc4d47a623 100644
--- a/paddle/pten/tests/kernels/test_mean_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc
@@ -35,7 +35,8 @@ TEST(DEV_API, mean) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 4}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   float sum = 0.0;
   for (size_t i = 0; i < 12; ++i) {
diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc
index dc90043305ca022347fa611ad08fd4a0bc2c79dd..ac2bb60cf9fe6b97e7d8dbb8e9204aa2c08335f9 100644
--- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc
@@ -37,7 +37,8 @@ TEST(DEV_API, reshape) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 2, 2, 3}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   for (int i = 0; i < dense_x.numel(); i++) {
     dense_x_data[i] = i;
diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc
index 106835a204c65c3ae3f48aad512635bf1a1a9d6e..abb592cde3ff4276f9b0dbce3afb9d912a2e0f9f 100644
--- a/paddle/pten/tests/kernels/test_scale_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc
@@ -36,7 +36,8 @@ TEST(DEV_API, scale) {
                                                   framework::make_ddim({3, 4}),
                                                   pten::DataLayout::NCHW));
 
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
   for (size_t i = 0; i < 12; ++i) {
     dense_x_data[i] = i * 1.0;
   }
@@ -68,7 +69,8 @@ TEST(DEV_API, scale_host) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 4}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
   for (size_t i = 0; i < 12; ++i) {
     dense_x_data[i] = i * 1.0;
   }
@@ -77,7 +79,7 @@ TEST(DEV_API, scale_host) {
                           pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                 framework::make_ddim({1}),
                                                 pten::DataLayout::NCHW));
-  scale.mutable_data<float>()[0] = 2;
+  scale.data<float>()[0] = 2;
   float bias = 1;
   bool bias_after_scale = true;
 
diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc
index 41d694a025f42e54a6dd347476deca7ba921c64c..595f0b96920ae24b2daadeca8e749d0232627720 100644
--- a/paddle/pten/tests/kernels/test_sum_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc
@@ -35,7 +35,8 @@ TEST(DEV_API, sum) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 4}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   float sum = 0.0;
   for (size_t i = 0; i < 12; ++i) {
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
index 4ec43aa1e05ed448f50096ae840ff03775be4013..a2797adff251aea3535f86e5c423463d748c37b3 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
@@ -70,7 +70,7 @@ class ShardingOptimizerStage2(Optimizer):
                  device="gpu",
                  **kw):
 
-        super().__init__(optim._learning_rate, params, kw)
+        # super().__init__(optim._learning_rate, params, kw)
 
         # Segmentation information
         self._dtype_rank_params = OrderedDict(
@@ -83,8 +83,6 @@ class ShardingOptimizerStage2(Optimizer):
         # Default information
         self._optim_defaults = kw
         self._optim = optim
-        self._ori_parameter_list = self._optim._parameter_list
-        self._ori_param_groups = self._optim._param_groups
 
         assert hasattr(self._optim, "_master_weights"
                        ), "Must use optimizer with _master_weights attribute"
@@ -336,24 +334,11 @@ class ShardingOptimizerStage2(Optimizer):
 
         if self.offload:
             params_list = [self.offload_params.buffer]
-        else:
-            # Synchronize optimizer parameters for the current rank
-            params_list = []
-            for dtype in self.dtype_rank_params.keys():
-                params_list.extend(self.dtype_rank_params[dtype][self.rank])
 
-        params_name_list = list(map(lambda p: p.name, params_list))
-        if not isinstance(self._optim._param_groups[0], dict):
-            self._optim._parameter_list = params_list
-            self._optim._param_groups = params_list
-        else:
-            for param_group in self._optim._param_groups:
-                p_group = []
-                for param in param_group['params']:
-                    if param.name in params_name_list:
-                        p_group.append(params_list[params_name_list.index(
-                            param.name)])
-                param_group['params'] = p_group
+            # TODO(Baibaifan): Offload will support param_groups later.
+            if not isinstance(self._optim._param_groups[0], dict):
+                self._optim._parameter_list = params_list
+                self._optim._param_groups = params_list
 
         # Run the optimizer of the current rank step
         if self.offload:
@@ -371,10 +356,6 @@ class ShardingOptimizerStage2(Optimizer):
         # Synchronize all the updated shards in between the ranks
         self._broadcast_params()
 
-        # Return full parameters to optimizer parameters
-        self._optim._parameter_list = self._ori_parameter_list
-        self._optim._param_groups = self._ori_param_groups
-
     def _clear_cache(self):
         self.__segment_params.clear()
         self._dtype_rank_params.clear()
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
index e5d04aac1551e64f63625722b08088eb3d8552b6..41c6f92230ab3e0e8de9aec0abdf920fad1ef232 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
@@ -33,7 +33,7 @@ from paddle.fluid.framework import ParamBase
 from paddle.fluid.clip import ClipGradByGlobalNorm
 from paddle.distributed.collective import _get_global_group
 
-from .sharding_utils import Type, ShardingClipGrad
+from .sharding_utils import Type, ShardingClipGrad, device_guard
 from ..pp_utils.utils import _all_gather
 
 # CUDA alignment 256 bytes
@@ -56,6 +56,13 @@ class ShardingStage3(nn.Layer):
     .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf.
     """
 
+    # TODO(Baibaifan)
+    # Feature Notes:
+    # 1. The model supports the segmentation of parameters by global ranks in layers.
+    # 2. Support communication flow and computing flow.
+    # 3. Support offload function.
+    # 4. Support the establishment of independent communication groups.
+
     def __init__(self,
                  layer,
                  optimizer,
@@ -77,6 +84,15 @@ class ShardingStage3(nn.Layer):
         self._offload = offload
         self._sync_comm = sync_comm
 
+        global DEV
+        DEV = "cpu" if paddle.get_device() == "cpu" else paddle.get_device(
+        ).split(":")[0]
+        global DEV_ID
+        DEV_ID = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
+                                                            .split(":")[1])
+        global param2dtype
+        param2dtype = dict()
+
         # Communication group establishment
         self._group = dist.new_group(_get_global_group()
                                      .ranks) if group is None else group
@@ -85,6 +101,9 @@ class ShardingStage3(nn.Layer):
         self._rank = self._group.rank
         self._global_root_rank = 0  # picking rank 0 as the reference
         self._global_ranks = self._group.ranks
+
+        # Parameter segmentation for global ranks
+        # After flatten -> self._param2buffer_size, self._param2buffer, self._trainable_params
         self._param2buffer_size = dict()  # {param.name: size}
         self._param2buffer = dict(
         )  # {param.name: [(start0, end0),(start1, end1), ...]}
@@ -116,12 +135,16 @@ class ShardingStage3(nn.Layer):
         self._order_tracer = OrderedDict()
         self._order_tracer["order"] = 0
         self._order_tracer["layer"] = []
+
         # Register task flow
         self._task_flow = TaskFlow()
+
         # Register forward hooks
         self._register_forward_hooks(self._layer)
+
         # Register backward parameter hooks
         self._register_backward_hooks()
+
         # Redefine optimizer step and clear function
         self._redefine_opt_step()
         self._redefine_opt_clear()
@@ -152,7 +175,6 @@ class ShardingStage3(nn.Layer):
                 param, "fw_storage"
             ), "Find {} don't have fw_storage attribute.".format(param.name)
 
-            # param.bw_storage.zero_()
             param.fw_storage.clear_gradient(False)
             param.fw_storage._gradient_set_empty(False)
             param.bw_storage._clear()
@@ -192,6 +214,9 @@ class ShardingStage3(nn.Layer):
         return fw
 
     def _segment_rank_params(self, layer, name="last_layer"):
+        """
+        Flatten parameters according to layer.
+        """
         current_layer_params = _current_layer_params(layer)
         if current_layer_params:
             CHECK_LAYER[id(layer)] = name
@@ -201,6 +226,10 @@ class ShardingStage3(nn.Layer):
             self._segment_rank_params(sub_layer, name)
 
     def _flatten_layer_params(self, layer, current_layer_params):
+        """
+        Parameter segmentation and memory integration.
+        """
+
         def _add_manage_info(trainable_param):
             return _PartitionParam(trainable_param)
 
@@ -238,8 +267,13 @@ class ShardingStage3(nn.Layer):
 
             # 3.Flatten layer params and release other rank buffer
             self._param_storage(param, buffer_size)
+            # Record param's dtype
+            param2dtype[param.name] = param.dtype
 
     def _param_storage(self, param, buffer_size):
+        """
+        This is a function to simplify the handling of parameter InternalStorages.
+        """
         assert isinstance(buffer_size, int)
         value = np.zeros(
             buffer_size,
@@ -264,16 +298,31 @@ class ShardingStage3(nn.Layer):
         param._clear()
 
         # Current rank param_storage
-        param.fw_storage = core.VarBase(
-            buffer._slice(start, end), "slice@" + param.name)
+        if self._offload:
+            param.fw_storage = core.VarBase(
+                buffer._slice(start, end),
+                core.CPUPlace(), "slice@" + param.name)
+        else:
+            param.fw_storage = core.VarBase(
+                buffer._slice(start, end), "slice@" + param.name)
         param.status = "part"
 
         # Updata optimizer master weights
-        if param.dtype == Type.fp16.value:
+        if param.dtype == Type.fp16.value and not self._offload:
             self._optim._master_weights[param.fw_storage.name] = paddle.cast(
                 param.fw_storage, Type.fp32.value)
 
     def _register_forward_hooks(self, layer):
+        """
+        Register pylayer to manage memory slices.
+        There are four stages:
+        FW
+        1. Before the forward layers, synchronize the full parameters.
+        2. After the forward layers, release the full parameter and keep the parameter slice.
+        BW
+        3. Before the backward layers, synchronize the full parameters and create param's grad.
+        4. After the gradient accumulation, release the full parameter and keep the parameter slice.
+        """
         current_layer_params = _current_layer_params(layer)
         if current_layer_params:
             self._register_forward_all_hooks(layer, self._task_flow)
@@ -286,13 +335,13 @@ class ShardingStage3(nn.Layer):
             return ForwardPreHooks(layer, self._order_tracer,
                                    self._trainable_params, self._param2buffer,
                                    self._rank, self._group, self._sync_comm,
-                                   task_flow)
+                                   self._offload, task_flow)
 
         def _forward_post_hook(layer, inputs, outputs):
             return ForwardPostHooks.apply(
                 outputs, layer, self._order_tracer, self._trainable_params,
                 self._param2buffer, self._param2buffer_size, self._rank,
-                self._group, self._sync_comm, task_flow)
+                self._group, self._sync_comm, self._offload, task_flow)
 
         # register previous forward hooks
         sub_layer.register_forward_pre_hook(_forward_pre_hook)
@@ -302,6 +351,10 @@ class ShardingStage3(nn.Layer):
 
     @paddle.no_grad()
     def _sync_buffers(self):
+        """
+        Sync all the param buffers from all ranks (e.g. batch norm statistics).
+        """
+
         for buffer in self._layer.buffers(include_sublayers=True):
             dist.broadcast(
                 buffer,
@@ -319,6 +372,9 @@ class ShardingStage3(nn.Layer):
             return getattr(self._layer, name)
 
     def _update_params(self):
+        """
+        Update parameters to optimizer memory slice.
+        """
         update_list = []
         assert len(self._trainable_params.keys()) > 0
         current_layer_params = self._layer.parameters(include_sublayers=True)
@@ -331,36 +387,35 @@ class ShardingStage3(nn.Layer):
                     param.name)
 
             if self._accumulate_grads:
-                param.bw_storage.scale_(scale=self._world_size_scaling)
+                if self._offload:
+                    with device_guard(device="cpu"):
+                        param.bw_storage.scale_(scale=self._world_size_scaling)
+                else:
+                    param.bw_storage.scale_(scale=self._world_size_scaling)
             param.fw_storage = _VarBaseWrapper(param)
             param.fw_storage._copy_gradient_from(param.bw_storage)
             update_list.append(param)
         return update_list
 
-    def get_all_parameters(self):
+    def get_all_parameters(self, convert2cpu=False):
+        """
+        Get the full parameters and return the corresponding task flows.
+        """
         assert len(self._trainable_params.keys()) > 0
         current_layer_params = self._layer.parameters(include_sublayers=True)
         trainable_params = list(
             filter(lambda x: x.trainable, current_layer_params))
-        for param in trainable_params:
-            if param.use_count > 0:
-                continue
-            assert hasattr(
-                param,
-                "fw_storage"), "Find {} don't have fw_storage attribute".format(
-                    param.name)
-
-            full_param = _all_gather(
-                param.fw_storage, self._group, use_calc_stream=True)
-            dist.wait(
-                tensor=full_param, group=self._group, use_calc_stream=True)
-            core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to(
-                param)
-            param.value().get_tensor()._set_dims(param.shape)
-            param.fw_storage._clear()
-            param.fw_storage = None
-            param.status = "all"
-            param.use_count += 1
+        t_flow = _allgather_buffer(
+            trainable_params,
+            self._group,
+            use_calc_stream=True,
+            task_flow=TaskFlow(),
+            sync_wait=True,
+            offload=self._offload,
+            convert2cpu=convert2cpu)
+        if convert2cpu:
+            for param in current_layer_params:
+                t_flow.full_param[param.name]._share_buffer_to(param)
 
         self._optim._parameter_list = self._ori_parameter_list
         self._optim._param_groups = self._ori_param_groups
@@ -393,13 +448,28 @@ class ShardingStage3(nn.Layer):
                         use_calc_stream=True)
 
                     start, end = self._param2buffer[param.name][self._rank]
-                    if not self._accumulate_grads or param.bw_storage is None:
+                    if not self._accumulate_grads or param.bw_storage is None or not param.bw_storage.value(
+                    ).get_tensor()._is_initialized():
                         param.bw_storage = core.VarBase(
                             full_grad._slice(start, end)).detach().clone()
+                        if self._offload:
+                            param.bw_storage = _device2cpu(param.bw_storage,
+                                                           True)
                     else:
-                        param.bw_storage.add_(
-                            core.VarBase(full_grad._slice(start, end)).detach()
-                            .clone())
+                        if self._offload:
+                            cpu_grad = _device2cpu(
+                                core.VarBase(full_grad._slice(start, end))
+                                .detach().clone(), True)
+                            param.bw_storage = paddle.add(param.bw_storage,
+                                                          cpu_grad)
+                        else:
+                            # param.bw_storage.add_(
+                            #     core.VarBase(full_grad._slice(start, end))
+                            #     .detach().clone())
+                            param.bw_storage = paddle.add(
+                                param.bw_storage,
+                                core.VarBase(full_grad._slice(
+                                    start, end)).detach().clone())
                 param.clear_gradient(False)
                 param._gradient_set_empty(False)
                 tmp_var = self._task_flow.full_grad.pop(param.name)
@@ -410,15 +480,16 @@ class ShardingStage3(nn.Layer):
                     param.use_count = 0
                     param._clear()
                     start, end = self._param2buffer[param.name][self._rank]
-                    with paddle.amp.auto_cast(enable=False):
-                        param.fw_storage = core.VarBase(
-                            self._task_flow.full_param[param.name]._slice(start,
-                                                                          end),
-                            param.name + "@slice").detach().clone()
+                    param.fw_storage = core.VarBase(
+                        self._task_flow.full_param[param.name]._slice(
+                            start, end), param.name + "@slice").detach().clone()
                     param.status = "part"
                     tmp_var = self._task_flow.full_param.pop(param.name)
                     tmp_var._clear()
 
+                    if self._offload:
+                        param.fw_storage = _device2cpu(param.fw_storage, True)
+
         return reduce
 
     def _redefine_opt_step(self):
@@ -429,7 +500,11 @@ class ShardingStage3(nn.Layer):
         def _opt_step(self):
             if not update_scaler:
                 params_slice_func()
-            opt_step()
+            if self.offload:
+                with device_guard(device="cpu"):
+                    opt_step()
+            else:
+                opt_step()
 
         self._optim.step = MethodType(_opt_step, self._optim)
 
@@ -443,7 +518,7 @@ class ShardingStage3(nn.Layer):
 
 
 def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer, rank,
-                    group, sync_comm, task_flow):
+                    group, sync_comm, offload, task_flow):
 
     # Record layer's id
     layer_id = id(layer)
@@ -451,21 +526,28 @@ def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer, rank,
 
     if layer_id not in order_tracer.keys() or sync_comm:
         use_calc, sync_wait = True, True
+
+        # Whether to use calc stream
         task_flow.use_calc[layer_id] = use_calc
     else:
+        # Whether to use calc stream
         task_flow.use_calc[layer_id] = use_calc
-        _wait_layer(trainable_params, layer_id, task_flow, group, use_calc)
+        # wait current layer params
+        _wait_layer(trainable_params[layer_id], task_flow, group, use_calc,
+                    offload)
 
         if layer_id == order_tracer["layer"][-1]: return
         order_ = order_tracer[layer_id]
         layer_id = order_tracer["layer"][order_ + 1]
+
     _allgather_buffer(
-        layer_id,
-        trainable_params,
+        trainable_params[layer_id],
         group,
         use_calc_stream=use_calc,
         task_flow=task_flow,
-        sync_wait=sync_wait)
+        sync_wait=sync_wait,
+        offload=offload)
+
     return
 
 
@@ -473,15 +555,20 @@ class ForwardPostHooks(PyLayer):
     @staticmethod
     def forward(ctx, inputs, layer, order_tracer, trainable_params,
                 param2buffer, param2buffer_size, rank, group, sync_comm,
-                task_flow):
-        _release_param(layer, trainable_params, param2buffer, rank, task_flow)
+                offload, task_flow):
 
         layer_id = id(layer)
+        # release current layer full params
+        _release_param(trainable_params[layer_id], param2buffer, rank,
+                       task_flow, offload)
+
         if layer_id not in order_tracer.keys():
             order_ = order_tracer["order"]
             order_tracer[layer_id] = order_
             order_tracer["order"] += 1
             order_tracer["layer"].append(layer_id)
+
+        # Record the info needed by backward
         ctx.order_tracer = order_tracer
         ctx.task_flow = task_flow
         ctx.group = group
@@ -489,6 +576,7 @@ class ForwardPostHooks(PyLayer):
         ctx.sync_comm = sync_comm
         ctx.trainable_params = trainable_params
         ctx.param2buffer_size = param2buffer_size
+        ctx.offload = offload
 
         return inputs
 
@@ -502,31 +590,39 @@ class ForwardPostHooks(PyLayer):
         trainable_params = ctx.trainable_params
         param2buffer_size = ctx.param2buffer_size
         sync_comm = ctx.sync_comm
+        offload = ctx.offload
         layer_id = id(layer)
         use_calc, sync_wait = False, False
+
+        # Allgather params synchronization
         if sync_comm:
             use_calc, sync_wait = True, True
             _allgather_buffer(
-                layer_id,
-                trainable_params,
+                trainable_params[layer_id],
                 group,
                 use_calc_stream=use_calc,
                 task_flow=task_flow,
-                sync_wait=sync_wait)
+                sync_wait=sync_wait,
+                offload=offload)
         else:
-            _wait_layer(trainable_params, layer_id, task_flow, group, use_calc)
-        _create_params_grad(layer, trainable_params, param2buffer_size,
+            _wait_layer(trainable_params[layer_id], task_flow, group, use_calc,
+                        offload)
+
+        # Create params's grad
+        _create_params_grad(trainable_params[layer_id], param2buffer_size,
                             task_flow)
+
+        # Whether to use calc stream
         task_flow.use_calc[layer_id] = use_calc
         if layer_id != order_tracer["layer"][0] and not sync_comm:
             layer_next_id = order_tracer["layer"][order_tracer[layer_id] - 1]
             _allgather_buffer(
-                layer_next_id,
-                trainable_params,
+                trainable_params[layer_next_id],
                 group,
                 use_calc_stream=use_calc,
                 task_flow=task_flow,
-                sync_wait=sync_wait)
+                sync_wait=sync_wait,
+                offload=offload)
 
         return args
 
@@ -547,8 +643,12 @@ class TaskFlow:
         self.callback = callback
 
 
-def _release_param(layer, trainable_params, param2buffer, rank, task_flow):
-    for param in trainable_params[id(layer)]:
+def _release_param(trainable_params,
+                   param2buffer,
+                   rank,
+                   task_flow,
+                   offload=False):
+    for param in trainable_params:
         # async communicate share weight not clear
         param.use_count -= 1
         if param.use_count == 0:
@@ -562,11 +662,18 @@ def _release_param(layer, trainable_params, param2buffer, rank, task_flow):
                 param.status = "part"
                 tmp_var = task_flow.full_param.pop(param.name)
                 tmp_var._clear()
+
+                if offload:
+                    param.fw_storage = _device2cpu(param.fw_storage)
     return
 
 
-def _wait_layer(trainable_params, layer_id, task_flow, group, use_calc_stream):
-    for param in trainable_params[layer_id]:
+def _wait_layer(trainable_params,
+                task_flow,
+                group,
+                use_calc_stream,
+                offload=False):
+    for param in trainable_params:
         if param.status == "all":
             param.use_count += 1
             continue
@@ -576,36 +683,43 @@ def _wait_layer(trainable_params, layer_id, task_flow, group, use_calc_stream):
                 paddle.device.cuda.synchronize()
             core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to(
                 param)
-            param.value().get_tensor()._set_dims(param.shape)
             param.fw_storage._clear()
             param.fw_storage = None
             param.status = "all"
             param.use_count += 1
         else:
             _allgather_buffer(
-                layer_id,
                 trainable_params,
                 group,
-                use_calc_stream,
-                task_flow,
-                sync_wait=True)
+                use_calc_stream=True,
+                task_flow=task_flow,
+                sync_wait=True,
+                offload=offload)
             break
     return task_flow
 
 
-def _allgather_buffer(layer_id,
-                      trainable_params,
+def _allgather_buffer(trainable_params,
                       group,
                       use_calc_stream,
                       task_flow,
-                      sync_wait=False):
-    for param in trainable_params[layer_id]:
+                      sync_wait=False,
+                      offload=False,
+                      convert2cpu=False):
+
+    for param in trainable_params:
         if param.status == "all":
             param.use_count += 1
             continue
+
+        if offload:
+            param.fw_storage = _cpu2device(param)
+
         with paddle.amp.auto_cast(enable=False):
             full_param = _all_gather(
                 param.fw_storage, group, use_calc_stream=use_calc_stream)
+
+        # Allgather the current layer's parameters in the 1st step
         if sync_wait:
             with paddle.amp.auto_cast(enable=False):
                 dist.wait(
@@ -614,18 +728,26 @@ def _allgather_buffer(layer_id,
                     use_calc_stream=use_calc_stream)
             core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to(
                 param)
-            param.value().get_tensor()._set_dims(param.shape)
             param.fw_storage._clear()
             param.fw_storage = None
             param.status = "all"
             param.use_count += 1
         task_flow.full_param[param.name] = full_param
+
+        # Convert the parameter to CPU
+        if convert2cpu:
+            p_name = param.name
+            param = _device2cpu(param)
+            tmp_var = task_flow.full_param.pop(p_name)
+            tmp_var._clear()
+            task_flow.full_param[p_name] = param
+
     return task_flow
 
 
 @paddle.no_grad()
-def _create_params_grad(layer, trainable_params, param2buffer_size, task_flow):
-    for param in trainable_params[id(layer)]:
+def _create_params_grad(trainable_params, param2buffer_size, task_flow):
+    for param in trainable_params:
         if param.name in task_flow.full_grad.keys():
             continue
         assert isinstance(param2buffer_size[param.name], int)
@@ -668,6 +790,23 @@ def _OptimizerWrapper(optimizer, offload, group, update_params_slice):
     return optimizer
 
 
+def _device2cpu(trans_param, convert_dtype=False):  # Return the result of trans_param.cpu(); _clear() is called on the source.
+    if convert_dtype:  # Optionally cast to Type.fp32 before the transfer.
+        trans_param = paddle.cast(trans_param, Type.fp32.value)  # Rebinds the local name to the cast result.
+    tmp_p = trans_param.cpu()
+    trans_param._clear()  # NOTE(review): after a cast this clears the cast result, not the tensor the caller passed in.
+    return tmp_p
+
+
+def _cpu2device(param):  # Return param.fw_storage moved via .cuda(DEV_ID); the caller must assign the result itself.
+    tmp_p = param.fw_storage.cuda(DEV_ID)  # DEV_ID is defined outside this hunk.
+    param.fw_storage._clear()  # Clear the old fw_storage after it has been transferred.
+    if tmp_p.dtype == Type.fp32.value and param2dtype[
+            param.name] == Type.fp16.value:  # Slice is fp32 but the dtype recorded in param2dtype is fp16.
+        tmp_p = paddle.cast(tmp_p, Type.fp16.value)  # Cast back to the recorded fp16 dtype.
+    return tmp_p
+
+
 def _current_layer_params(layer):
     return layer.parameters(
         include_sublayers=False) + list(layer.extra_parameters) if hasattr(
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
index 51e85901e7d558c697382c5d433c2cc7661c3213..04474dcdfe5091b2986eeeedb9870c00d83970db 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
@@ -548,8 +548,10 @@ def func_to_source_code(function, dedent=True):
             "The type of 'function' should be a function or method, but received {}.".
             format(type(function).__name__))
     source_code_list, _ = inspect.getsourcelines(function)
+    # Replace comments with blank lines so that error messages are not misplaced
     source_code_list = [
-        line for line in source_code_list if not line.lstrip().startswith('#')
+        line if not line.lstrip().startswith('#') else '\n'
+        for line in source_code_list
     ]
     source_code = ''.join(source_code_list)
     if dedent:
diff --git a/python/paddle/fluid/tests/custom_op/attr_test_op.cc b/python/paddle/fluid/tests/custom_op/attr_test_op.cc
index 14cb0aa7c716d8449c672231f5399027275f8c5d..1c79d9a26aee3409fc2a32b755abcd45f4ca06c3 100644
--- a/python/paddle/fluid/tests/custom_op/attr_test_op.cc
+++ b/python/paddle/fluid/tests/custom_op/attr_test_op.cc
@@ -137,7 +137,9 @@ std::vector<paddle::Tensor> AttrTestForward(
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
         assign_cpu_kernel<data_t>(
-            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+            x.data<data_t>(),
+            out.mutable_data<data_t>(paddle::PlaceType::kCPU),
+            x.size());
       }));
 
   // Check attrs value
@@ -175,12 +177,13 @@ std::vector<paddle::Tensor> AttrTestBackward(
     const std::vector<std::string>& str_vec_attr) {
   auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape());
 
-  PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] {
-                               assign_cpu_kernel<data_t>(
-                                   grad_out.data<data_t>(),
-                                   grad_x.mutable_data<data_t>(),
-                                   grad_out.size());
-                             }));
+  PD_DISPATCH_FLOATING_TYPES(
+      grad_out.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            grad_out.data<data_t>(),
+            grad_x.mutable_data<data_t>(paddle::PlaceType::kCPU),
+            grad_out.size());
+      }));
 
   CheckAllBackwardAttrs(int_attr, float_vec_attr, str_vec_attr);
 
@@ -203,7 +206,9 @@ std::vector<paddle::Tensor> ConstAttrTestForward(
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
         assign_cpu_kernel<data_t>(
-            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+            x.data<data_t>(),
+            out.mutable_data<data_t>(paddle::PlaceType::kCPU),
+            x.size());
       }));
 
   // Check attrs value
@@ -241,12 +246,13 @@ std::vector<paddle::Tensor> ConstAttrTestBackward(
     const std::vector<std::string>& str_vec_attr) {
   auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape());
 
-  PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] {
-                               assign_cpu_kernel<data_t>(
-                                   grad_out.data<data_t>(),
-                                   grad_x.mutable_data<data_t>(),
-                                   grad_out.size());
-                             }));
+  PD_DISPATCH_FLOATING_TYPES(
+      grad_out.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            grad_out.data<data_t>(),
+            grad_x.mutable_data<data_t>(paddle::PlaceType::kCPU),
+            grad_out.size());
+      }));
 
   CheckAllBackwardAttrs(int_attr, float_vec_attr, str_vec_attr);
 
diff --git a/python/paddle/fluid/tests/custom_op/concat_and_split.h b/python/paddle/fluid/tests/custom_op/concat_and_split.h
index 9f24cc43699773fc531ccd68d4219ebcdfdab8eb..cbec4653a207d9b92da48d0cad79288159329a6a 100644
--- a/python/paddle/fluid/tests/custom_op/concat_and_split.h
+++ b/python/paddle/fluid/tests/custom_op/concat_and_split.h
@@ -47,7 +47,7 @@ void ConcatCpuKernel(const std::vector<paddle::Tensor>& ins,
   int64_t out_cols = 0;
   auto ins_cols = GetCols(ins, out_rows, &out_cols);
 
-  auto* out_data = out->mutable_data<data_t>();
+  auto* out_data = out->mutable_data<data_t>(paddle::PlaceType::kCPU);
   int64_t col_idx = 0;
   for (size_t i = 0; i < num; ++i) {
     int64_t col_len = ins_cols[i];
@@ -76,7 +76,9 @@ void SplitCpuKernel(const paddle::Tensor& in,
     int64_t col_idx = 0;
     for (size_t j = 0; j < num; ++j) {
       int64_t col_len = out_cols[j];
-      auto* out_data = outs->at(j).mutable_data<data_t>() + i * col_len;
+      auto* out_data =
+          outs->at(j).mutable_data<data_t>(paddle::PlaceType::kCPU) +
+          i * col_len;
       std::memcpy(out_data, in_data + col_idx, sizeof(data_t) * col_len);
       col_idx += col_len;
     }
diff --git a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc
index b9c10f479e0a39eb8e33ffceb30e8eb9cc8efa9e..ae60799d239467ff8637f2e494315c2ac8c08744 100644
--- a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc
@@ -76,7 +76,9 @@ std::vector<paddle::Tensor> ConjFunction(const paddle::Tensor& x) {
   PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
       x.type(), "ConjCPUKernel", ([&] {
         ConjCPUKernel<data_t>(
-            x.data<data_t>(), x.size(), out.mutable_data<data_t>());
+            x.data<data_t>(),
+            x.size(),
+            out.mutable_data<data_t>(paddle::PlaceType::kCPU));
       }));
 
   return {out};
diff --git a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
index 0f7d323b5451efba5a503d9039a03531e1773efb..d5f161fc5b775d92627bfcd0b0f4b0fa347d02be 100644
--- a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
+++ b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
@@ -32,7 +32,9 @@ std::vector<paddle::Tensor> DispatchTestInterger(const paddle::Tensor& x) {
   PD_DISPATCH_INTEGRAL_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
         assign_cpu_kernel<data_t>(
-            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+            x.data<data_t>(),
+            out.mutable_data<data_t>(paddle::PlaceType::kCPU),
+            x.size());
       }));
 
   return {out};
@@ -50,7 +52,9 @@ std::vector<paddle::Tensor> DispatchTestFloatAndInteger(
   PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
         assign_cpu_kernel<data_t>(
-            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+            x.data<data_t>(),
+            out.mutable_data<data_t>(paddle::PlaceType::kCPU),
+            x.size());
       }));
 
   return {out};
@@ -67,7 +71,9 @@ std::vector<paddle::Tensor> DispatchTestComplex(const paddle::Tensor& x) {
   PD_DISPATCH_COMPLEX_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
         assign_cpu_kernel<data_t>(
-            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+            x.data<data_t>(),
+            out.mutable_data<data_t>(paddle::PlaceType::kCPU),
+            x.size());
       }));
 
   return {out};
@@ -85,7 +91,9 @@ std::vector<paddle::Tensor> DispatchTestFloatAndComplex(
   PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
         assign_cpu_kernel<data_t>(
-            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+            x.data<data_t>(),
+            out.mutable_data<data_t>(paddle::PlaceType::kCPU),
+            x.size());
       }));
 
   return {out};
@@ -103,7 +111,9 @@ std::vector<paddle::Tensor> DispatchTestFloatAndIntegerAndComplex(
   PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
         assign_cpu_kernel<data_t>(
-            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+            x.data<data_t>(),
+            out.mutable_data<data_t>(paddle::PlaceType::kCPU),
+            x.size());
       }));
 
   return {out};
@@ -120,7 +130,9 @@ std::vector<paddle::Tensor> DispatchTestFloatAndHalf(const paddle::Tensor& x) {
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
         assign_cpu_kernel<data_t>(
-            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+            x.data<data_t>(),
+            out.mutable_data<data_t>(paddle::PlaceType::kCPU),
+            x.size());
       }));
 
   return {out};
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
index e08b4db1e98def0257e5123147a3fadc1130f444..9206d744990008496e7af43d67e000f9d00f6dab 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
@@ -29,7 +29,6 @@ from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import Shar
 
 seed = 2021
 epoch = 2
-batch_size = 32
 linear_size = 1000
 
 strategy = fleet.DistributedStrategy()
@@ -86,6 +85,7 @@ def optimizer_setting(model, use_pure_fp16, opt_group=False):
 
 def train_mlp(model,
               sharding_stage,
+              batch_size=100,
               use_pure_fp16=False,
               accumulate_grad=False,
               opt_group=False):
@@ -103,16 +103,13 @@ def train_mlp(model,
     if sharding_stage == 2:
         optimizer = ShardingOptimizerStage2(
             params=model.parameters(), optim=optimizer, group=group)
-        if accumulate_grad:
-            model = ShardingStage2(
-                model,
-                optimizer,
-                group=group,
-                buffer_max_size=2**21,
-                accumulate_grads=accumulate_grad)
-        else:
-            model = ShardingStage2(
-                model, optimizer, group=group, buffer_max_size=2**21)
+
+        model = ShardingStage2(
+            model,
+            optimizer,
+            group=group,
+            buffer_max_size=2**21,
+            accumulate_grads=batch_size == 20)
     else:
         optimizer = fleet.distributed_optimizer(optimizer)
         model = fleet.distributed_model(model)
@@ -145,12 +142,13 @@ def train_mlp(model,
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
             avg_loss.backward()
 
+            if not accumulate_grad:
+                optimizer.step()
+                optimizer.clear_grad()
+
+        if accumulate_grad:
             optimizer.step()
             optimizer.clear_grad()
-
-            if accumulate_grad and batch_id == 2:
-                return model.parameters()
-
     return model.parameters()
 
 
@@ -166,25 +164,22 @@ def test_dp_stage2():
     mlp3.set_state_dict(state_dict)
     mlp4.set_state_dict(state_dict)
     dp_params = train_mlp(
-        mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=True)
+        mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=False)
     stage2_params = train_mlp(
-        mlp2, sharding_stage=2, use_pure_fp16=False, opt_group=True)
+        mlp2, sharding_stage=2, use_pure_fp16=False, opt_group=False)
     for i in range(len(dp_params)):
-        for j in range(len(stage2_params)):
-            if dp_params[i].name == stage2_params[j].name:
-                np.testing.assert_allclose(
-                    dp_params[i].numpy(), stage2_params[j].numpy(), rtol=1e-6)
+        np.testing.assert_allclose(
+            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6)
 
     stage2_params = train_mlp(mlp3, sharding_stage=2)
     stage2_accumulate_grad = train_mlp(
-        mlp4, sharding_stage=2, accumulate_grad=True)
+        mlp4, sharding_stage=2, batch_size=20, accumulate_grad=True)
     for i in range(len(stage2_params)):
-        for j in range(len(stage2_accumulate_grad)):
-            if stage2_params[i].name == stage2_accumulate_grad[j].name:
-                np.testing.assert_allclose(
-                    stage2_params[i].numpy(),
-                    stage2_accumulate_grad[j].numpy(),
-                    rtol=1e-6)
+        np.testing.assert_allclose(
+            stage2_params[i].numpy(),
+            stage2_accumulate_grad[i].numpy(),
+            rtol=1e-5,
+            atol=1e-5)
 
     return
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py
index 5b0bec9c454b0fdfaea4d96ac821bfe8f859eff5..ddd31bc057f2e3f6eeeae571615f5e2991e6a8a2 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py
@@ -30,7 +30,6 @@ from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import Shar
 from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler
 
 epoch = 10
-batch_size = 32
 paddle.seed(2021)
 np.random.seed(2021)
 base_lr = 0.1
@@ -66,10 +65,10 @@ def reader_decorator(linear_size=1000):
 
 def optimizer_setting(model, use_pure_fp16, opt_group=False):
     clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
-    optimizer = paddle.optimizer.AdamW(
+    optimizer = paddle.optimizer.Momentum(
         parameters=[{
-            "params": model.parameters()
-        }] if opt_group else model.parameters(),
+            "params": list(model.parameters())
+        }] if opt_group else list(model.parameters()),
         learning_rate=0.001,
         weight_decay=0.00001,
         grad_clip=clip,
@@ -82,6 +81,7 @@ def train_mlp(model,
               sharding_stage,
               use_pure_fp16=False,
               accumulate_grad=False,
+              batch_size=100,
               opt_group=False,
               recompute=False):
     group = paddle.distributed.new_group([0, 1])
@@ -104,10 +104,14 @@ def train_mlp(model,
             optimizer,
             group=group,
             buffer_max_size=2**21,
-            accumulate_grads=accumulate_grad)
+            accumulate_grads=batch_size == 20)
     elif sharding_stage == 3:
         model = ShardingStage3(
-            model, optimizer=optimizer, group=group, sync_comm=recompute)
+            model,
+            optimizer=optimizer,
+            group=group,
+            accumulate_grads=batch_size == 20,
+            sync_comm=recompute)
 
     train_reader = paddle.batch(
         reader_decorator(), batch_size=batch_size, drop_last=True)
@@ -131,21 +135,22 @@ def train_mlp(model,
                 loss = paddle.nn.functional.cross_entropy(
                     input=out, label=label)
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
+            if not use_pure_fp16:
+                avg_loss.backward()
+            else:
+                scaler.scale(avg_loss).backward()
+
             if not accumulate_grad:
                 if not use_pure_fp16:
-                    avg_loss.backward()
                     optimizer.step()
                 else:
-                    scaler.scale(avg_loss).backward()
                     scaler.step(optimizer)
                     scaler.update()
                 optimizer.clear_grad()
         if accumulate_grad:
             if not use_pure_fp16:
-                avg_loss.backward()
                 optimizer.step()
             else:
-                scaler.scale(avg_loss).backward()
                 scaler.step(optimizer)
                 scaler.update()
             optimizer.clear_grad()
@@ -168,48 +173,50 @@ def test_stage2_stage3():
     mlp8.set_state_dict(state_dict)
     # fp32 
     stage2_params = train_mlp(
-        mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=True)
+        mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=False)
     stage3_params = train_mlp(
-        mlp2, sharding_stage=3, use_pure_fp16=False, opt_group=True)
+        mlp2, sharding_stage=3, use_pure_fp16=False, opt_group=False)
+
     for i in range(len(stage2_params)):
-        for j in range(len(stage3_params)):
-            if stage2_params[i].name == stage3_params[j].name:
-                np.testing.assert_allclose(
-                    stage2_params[i].numpy(),
-                    stage3_params[j].numpy(),
-                    rtol=1e-6)
+        np.testing.assert_allclose(
+            stage2_params[i].numpy(),
+            stage3_params[i].numpy(),
+            rtol=1e-6,
+            atol=1e-6)
+
     # fp32 accumulate grad
-    stage2_params = train_mlp(
+    stage3_params = train_mlp(
         mlp3,
-        sharding_stage=2,
+        sharding_stage=3,
         use_pure_fp16=False,
         accumulate_grad=True,
         opt_group=True)
-    stage3_params = train_mlp(
+    stage3_params_add = train_mlp(
         mlp4,
         sharding_stage=3,
         use_pure_fp16=False,
         accumulate_grad=True,
+        batch_size=20,
         opt_group=True)
-    for i in range(len(stage2_params)):
-        for j in range(len(stage3_params)):
-            if stage2_params[i].name == stage3_params[j].name:
-                np.testing.assert_allclose(
-                    stage2_params[i].numpy(),
-                    stage3_params[j].numpy(),
-                    rtol=1e-6)
+    for i in range(len(stage3_params)):
+        np.testing.assert_allclose(
+            stage3_params[i].numpy(),
+            stage3_params_add[i].numpy(),
+            rtol=1e-6,
+            atol=1e-6)
+
     # fp16
     stage2_params = train_mlp(
         mlp5, sharding_stage=2, use_pure_fp16=True, opt_group=False)
     stage3_params = train_mlp(
         mlp6, sharding_stage=3, use_pure_fp16=True, opt_group=False)
     for i in range(len(stage2_params)):
-        for j in range(len(stage3_params)):
-            if stage2_params[i].name == stage3_params[j].name:
-                np.testing.assert_allclose(
-                    stage2_params[i].numpy(),
-                    stage3_params[j].numpy(),
-                    rtol=1e-6)
+        np.testing.assert_allclose(
+            stage2_params[i].numpy(),
+            stage3_params[i].numpy(),
+            rtol=1e-4,
+            atol=1e-4)
+
     # fp16 recompute
     stage3_params = train_mlp(
         mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False)
@@ -220,12 +227,8 @@ def test_stage2_stage3():
         opt_group=False,
         recompute=True)
     for i in range(len(stage3_params)):
-        for j in range(len(stage3_params_re)):
-            if stage3_params[i].name == stage3_params_re[j].name:
-                np.testing.assert_allclose(
-                    stage3_params[i].numpy(),
-                    stage3_params_re[j].numpy(),
-                    rtol=1e-6)
+        np.testing.assert_allclose(
+            stage3_params[i].numpy(), stage3_params_re[i].numpy(), rtol=1e-6)
     return
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d4b4c02068aa71c51f02dd8df74fac14e65fafc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py
@@ -0,0 +1,192 @@
+# -*- coding: UTF-8 -*-
+
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import ast
+import time
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.nn import Linear
+from paddle.distributed import fleet
+from paddle.fluid.dygraph import nn
+
+from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3
+from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler
+
+epoch = 10
+batch_size = 32
+paddle.seed(2022)
+np.random.seed(2022)
+base_lr = 0.1
+momentum_rate = 0.9
+l2_decay = 1e-4
+fleet.init(is_collective=True)
+
+
+class MLP(fluid.Layer):
+    def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
+        super(MLP, self).__init__()
+
+        self._linear1 = Linear(linear_size, linear_size)
+        self._linear2 = Linear(linear_size, linear_size)
+        self._linear3 = Linear(linear_size, 10)
+
+    def forward(self, inputs):
+        y = self._linear1(inputs)
+        y = self._linear2(y)
+        y = self._linear3(y)
+        return y
+
+
+def reader_decorator(linear_size=1000):
+    def __reader__():
+        for _ in range(100):
+            img = np.random.rand(linear_size).astype('float32')
+            label = np.ones(1).astype('int64')
+            yield img, label
+
+    return __reader__
+
+
+def optimizer_setting(model, use_pure_fp16, opt_group=False):
+    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
+    optimizer = paddle.optimizer.AdamW(
+        parameters=[{
+            "params": model.parameters()
+        }] if opt_group else model.parameters(),
+        learning_rate=0.001,
+        weight_decay=0.00001,
+        grad_clip=clip,
+        multi_precision=use_pure_fp16)
+
+    return optimizer
+
+
+def train_mlp(model,
+              use_pure_fp16=False,
+              accumulate_grad=False,
+              offload=False,
+              convert2cpu=False):
+    group = paddle.distributed.new_group([0, 1])
+    optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
+
+    if use_pure_fp16:
+        model = paddle.amp.decorate(
+            models=model, level='O2', save_dtype='float32')
+        scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
+        scaler = ShardingScaler(scaler)
+
+    model = ShardingStage3(
+        model, optimizer=optimizer, group=group, offload=offload)
+
+    train_reader = paddle.batch(
+        reader_decorator(), batch_size=batch_size, drop_last=True)
+
+    train_loader = paddle.io.DataLoader.from_generator(
+        capacity=32,
+        use_double_buffer=True,
+        iterable=True,
+        return_list=True,
+        use_multiprocess=True)
+    train_loader.set_sample_list_generator(train_reader)
+
+    for eop in range(epoch):
+        model.train()
+        for batch_id, data in enumerate(train_loader()):
+            img, label = data
+            label.stop_gradient = True
+            img.stop_gradient = True
+            with paddle.amp.auto_cast(True, level='O2'):
+                out = model(img)
+                loss = paddle.nn.functional.cross_entropy(
+                    input=out, label=label)
+            avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
+            if not use_pure_fp16:
+                avg_loss.backward()
+            else:
+                scaler.scale(avg_loss).backward()
+            if not accumulate_grad:
+                if not use_pure_fp16:
+                    optimizer.step()
+                else:
+                    scaler.step(optimizer)
+                    scaler.update()
+                optimizer.clear_grad()
+        if accumulate_grad:
+            if not use_pure_fp16:
+                optimizer.step()
+            else:
+                scaler.step(optimizer)
+                scaler.update()
+            optimizer.clear_grad()
+    if not convert2cpu:
+        model.get_all_parameters()
+    else:
+        model.get_all_parameters(convert2cpu)
+    return model.parameters()
+
+
+def test_stage3_offload():
+    mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6 = MLP(), MLP(), MLP(), MLP(), MLP(
+    ), MLP(), MLP()
+    state_dict = mlp.state_dict()
+    mlp1.set_state_dict(state_dict)
+    mlp2.set_state_dict(state_dict)
+    mlp3.set_state_dict(state_dict)
+    mlp4.set_state_dict(state_dict)
+    mlp5.set_state_dict(state_dict)
+    mlp6.set_state_dict(state_dict)
+
+    # fp32 offload
+    stage3_params = train_mlp(mlp1, use_pure_fp16=False)
+    stage3_params_offload = train_mlp(mlp2, use_pure_fp16=False, offload=True)
+    for i in range(len(stage3_params)):
+        np.testing.assert_allclose(
+            stage3_params[i].numpy(),
+            stage3_params_offload[i].numpy(),
+            rtol=1e-6,
+            atol=1e-8)
+
+    # fp16 offload
+    stage3_params = train_mlp(mlp3, use_pure_fp16=True)
+    stage3_params_offload = train_mlp(mlp4, use_pure_fp16=True, offload=True)
+    for i in range(len(stage3_params)):
+        np.testing.assert_allclose(
+            stage3_params[i].numpy(),
+            stage3_params_offload[i].numpy(),
+            rtol=1e-2,
+            atol=1e-2)
+
+    # fp32 accumulate grad offload
+    stage3_params = train_mlp(mlp5, use_pure_fp16=False, accumulate_grad=True)
+    stage3_params_offload = train_mlp(
+        mlp6,
+        use_pure_fp16=False,
+        accumulate_grad=True,
+        offload=True,
+        convert2cpu=True)
+    for i in range(len(stage3_params)):
+        np.testing.assert_allclose(
+            stage3_params[i].numpy(),
+            stage3_params_offload[i].numpy(),
+            rtol=1e-6,
+            atol=1e-8)
+    return
+
+
+if __name__ == '__main__':
+    test_stage3_offload()
diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py
index 89756385486cf6690a0d1e7bd5c93462e579bdfd..f92465b739a2a760557663f53dd220ced8f82fa3 100755
--- a/python/paddle/fluid/tests/unittests/test_compare_op.py
+++ b/python/paddle/fluid/tests/unittests/test_compare_op.py
@@ -140,6 +140,19 @@ def create_paddle_case(op_type, callback):
                 self.assertEqual((out.numpy() == self.real_result).all(), True)
                 paddle.enable_static()
 
+        def test_not_equal(self):
+            if self.op_type == "not_equal":
+                paddle.disable_static()
+                x = paddle.to_tensor(
+                    np.array([1.2e-8, 2, 2, 1]), dtype="float32")
+                y = paddle.to_tensor(
+                    np.array([1.1e-8, 2, 2, 1]), dtype="float32")
+                op = eval("paddle.%s" % (self.op_type))
+                out = op(x, y)
+                self.real_result = np.array([0, 0, 0, 0]).astype(np.int64)
+                self.assertEqual((out.numpy() == self.real_result).all(), True)
+                paddle.enable_static()
+
         def test_assert(self):
             def test_dynamic_api_string(self):
                 if self.op_type == "equal":
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py
index f76dcb5687c2ab77e411e7ef3c4de64200d99c66..669ab7d8f7f342653bef7cb6b48abf75ee6b2d11 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py
@@ -23,10 +23,10 @@ from test_parallel_dygraph_dataparallel import TestMultipleGpus
 class TestDygraphShardingStage2(TestMultipleGpus):
 
     # check sharding logic as well as the accuracy with single mode
-    def test_dygraph_sharding_optimizer_stage2(self):
+    def test_dygraph_sharding_stage2(self):
         self.run_mnist_2gpu('dygraph_sharding_stage2.py')
 
-    def test_dygraph_sharding_optimizer_stage2_offload(self):
+    def test_dygraph_sharding_stage2_offload(self):
         self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py')
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py
index 89d5f2e8c7b292592369651887fc72bcabcb77ea..c7da5d1e941b43c6ae28b2a5a84a59bbea311a24 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py
@@ -23,9 +23,12 @@ from test_parallel_dygraph_dataparallel import TestMultipleGpus
 class TestDygraphShardingStage3(TestMultipleGpus):
 
     # check sharding logic as well as the accuracy with single mode
-    def test_dygraph_sharding_optimizer_stage3(self):
+    def test_dygraph_sharding_stage3(self):
         self.run_mnist_2gpu('dygraph_sharding_stage3.py')
 
+    def test_dygraph_sharding_stage3_offload(self):
+        self.run_mnist_2gpu('dygraph_sharding_stage3_offload.py')
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py
index 147824f341be43df33699b3a918880979c24485d..a36c0bf071332e3d5c67843b3084787035118cba 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py
@@ -216,6 +216,144 @@ def adam_step(inputs, attributes):
     return param_out, moment1_out, moment2_out
 
 
+def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
+                     lazy_mode):
+    '''
+    Simulate one step of the adam optimizer for a sparse (row-wise) gradient
+    :param inputs / attributes: dict of dense inputs and dict of attributes
+    :param height, rows, row_numel, np_grad: outputs are [height, row_numel];
+    np_grad[i] is the gradient of row rows[i] (rows is a list of row ids)
+    :param lazy_mode: True updates only `rows`; False updates every row
+    :return tuple: tuple of output param, moment1, moment2
+    '''
+    param = inputs['Param']
+    # the gradient is not read from inputs; it is passed as rows / np_grad
+    moment1 = inputs['Moment1']
+    moment2 = inputs['Moment2']
+    lr = inputs['LearningRate']
+    beta1_pow = inputs['Beta1Pow']
+    beta2_pow = inputs['Beta2Pow']
+
+    beta1 = attributes['beta1']
+    beta2 = attributes['beta2']
+    epsilon = attributes['epsilon']
+    # rows that update_row never touches keep these zeros in all three outputs
+    moment1_out = np.zeros(shape=[height, row_numel])
+    moment2_out = np.zeros(shape=[height, row_numel])
+    param_out = np.zeros(shape=[height, row_numel])
+
+    def update_row(row_id, update_value):
+        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
+                                                         ) * update_value
+        moment2_out[row_id] = beta2 * moment2[row_id] + (
+            1 - beta2) * np.square(update_value)
+        lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
+        param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
+            np.sqrt(moment2_out[row_id]) + epsilon))
+
+    if lazy_mode:
+        for idx, row_id in enumerate(rows):
+            update_row(row_id, np_grad[idx])
+    else:
+        for row_id in range(param_out.shape[0]):
+            update_value = np.zeros(np_grad[0].shape).astype("float32")
+            if row_id in rows:
+                update_value = np_grad[rows.index(row_id)]
+            update_row(row_id, update_value)
+
+    return param_out, moment1_out, moment2_out
+
+
+class TestSparseAdamOp(unittest.TestCase):
+    def setup(self, scope, place, lazy_mode):
+        beta1 = 0.78
+        beta2 = 0.836
+        epsilon = 1e-4
+        beta1_pow = np.array([beta1**10]).astype("float32")
+        beta2_pow = np.array([beta2**10]).astype("float32")
+
+        height = 10
+        rows = [0, 4, 7]
+        self.rows = rows
+        row_numel = 12
+        self.row_numel = row_numel
+        self.dense_inputs = {
+            "Param": np.full((height, row_numel), 5.0).astype("float32"),
+            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
+            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
+            'Beta1Pow': beta1_pow,
+            'Beta2Pow': beta2_pow,
+            "LearningRate": np.full((1), 2.0).astype("float32")
+        }
+        self.init_output = np.full((height, row_numel), 0.0).astype("float32")
+        self.attrs = {
+            'epsilon': epsilon,
+            'beta1': beta1,
+            'beta2': beta2,
+            'min_row_size_to_use_multithread': 2
+        }
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(np_array, place)
+
+        self.sparse_inputs = ["Grad"]
+
+        param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
+                                                 height, rows, row_numel,
+                                                 np_array, lazy_mode)
+        self.outputs = {
+            "ParamOut": param_out,
+            "Moment1Out": mom1,
+            "Moment2Out": mom2,
+            'Beta1PowOut': beta1_pow * beta1,
+            'Beta2PowOut': beta2_pow * beta2
+        }
+
+    def check_with_place(self, place, lazy_mode):
+        scope = core.Scope()
+        self.setup(scope, place, lazy_mode)
+        # Bind each input, output and attribute of the op under its own name.
+        op_args = dict()
+        op_args['lazy_mode'] = lazy_mode
+        for key, np_array in self.dense_inputs.items():
+            var = scope.var(key).get_tensor()
+            var.set(np_array, place)
+            op_args[key] = key
+        for s in self.sparse_inputs:
+            op_args[s] = s
+        for s in self.outputs:
+            var = scope.var(s).get_tensor()
+            var.set(self.init_output, place)
+            op_args[s] = s
+        for k in self.attrs:
+            op_args[k] = self.attrs[k]
+
+        # create and run adam operator
+        adam_op = Operator("adam", **op_args)
+        adam_op.run(scope, place)
+        # Every output must match the numpy reference within 1e-5 (absolute).
+        for key, np_array in self.outputs.items():
+            out_var = scope.var(key).get_tensor()
+            actual = np.array(out_var)
+            actual = actual.reshape([actual.size])
+            np_array = np_array.reshape([np_array.size])
+
+            for i in range(np_array.size):
+                self.assertLess(abs(actual[i] - np_array[i]), 0.00001)
+
+    def test_sparse_adam(self):
+        xpu_version = core.get_xpu_device_version(0)
+        version_str = "xpu2" if xpu_version == core.XPUVersion.XPU2 else "xpu1"
+        if "xpu2" == version_str:
+            self.check_with_place(paddle.XPUPlace(0), False)
+
+
 class TestAdamOpBetaVariable(OpTest):
     def setUp(self):
         '''Test Adam Op with beta as Variable
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 2b64337799cebdc0f2b5c0ce3580eb37ca12caa4..f37b45eef1b80211cbb749c20b489af43cdafdee 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -1,7 +1,7 @@
 - api : add
   args : (const Tensor& x, const Tensor& y)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ElementwiseInferMeta
     param : [x, y, -1]
   kernel :
@@ -10,7 +10,7 @@
 - api : cast
   args : (const Tensor& x, DataType out_dtype)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : CastInferMeta
   kernel :
     func : cast
@@ -38,7 +38,7 @@
 - api : divide
   args : (const Tensor& x, const Tensor& y)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ElementwiseInferMeta
     param : [x, y, -1]
   kernel :
@@ -47,31 +47,31 @@
 - api : dot
   args : (const Tensor& x, const Tensor& y)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : DotInferMeta
-  kernel : 
+  kernel :
     func : dot
 
 - api : empty
   args : (const ScalarArray& shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW)
   output: Tensor
-  infer_meta : 
+  infer_meta :
     func : CreateInferMeta
     param : [shape, dtype, layout]
-  kernel : 
+  kernel :
     func : empty
     param : [shape]
     data_type : dtype
     backend : place
     layout : layout
-  
+
 - api : empty_like
   args : (const Tensor& x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED)
   output: Tensor
-  infer_meta : 
+  infer_meta :
     func : CreateLikeInferMeta
     param : [x, dtype, layout]
-  kernel : 
+  kernel :
     func : empty_like
     param : []
     data_type : dtype > x
@@ -81,31 +81,31 @@
 - api : flatten
   args : (const Tensor& x, int start_axis, int stop_axis)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : FlattenInferMeta
-  kernel : 
+  kernel :
     func : flatten
 
 - api : full
   args : (const ScalarArray& shape, const Scalar& value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW)
   output: Tensor
-  infer_meta : 
+  infer_meta :
     func : CreateInferMeta
     param : [shape, dtype, layout]
-  kernel : 
+  kernel :
     func : full
     param : [shape, value]
     data_type : dtype
     backend : place
     layout : layout
-  
+
 - api : full_like
   args : (const Tensor& x, const Scalar& value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED)
   output: Tensor
-  infer_meta : 
+  infer_meta :
     func : CreateLikeInferMeta
     param : [x, dtype, layout]
-  kernel : 
+  kernel :
     func : full_like
     param : [value]
     data_type : dtype > x
@@ -115,25 +115,25 @@
 - api : matmul
   args : (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : MatmulInferMeta
-  kernel : 
+  kernel :
     func : matmul
   backward : matmul_grad
 
 - api : mean
   args : (const Tensor& x, const std::vector<int64_t>& axis={}, bool keep_dim=false)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ReduceInferMeta
     param: [x, axis, keep_dim]
-  kernel : 
+  kernel :
     func : mean
 
 - api : multiply
   args : (const Tensor& x, const Tensor& y)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ElementwiseInferMeta
     param : [x, y, -1]
   kernel :
@@ -147,15 +147,15 @@
 - api : reshape
   args : (const Tensor& x, const ScalarArray& shape)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ReshapeInferMeta
-  kernel : 
+  kernel :
     func : reshape
 
 - api : scale
   args : (const Tensor& x, const Scalar& scale, float bias, bool bias_after_scale)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : UnchangedInferMeta
     param : [x]
   kernel :
@@ -164,7 +164,7 @@
 - api : subtract
   args : (const Tensor& x, const Tensor& y)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ElementwiseInferMeta
     param : [x, y, -1]
   kernel :
@@ -173,10 +173,10 @@
 - api : sum
   args : (const Tensor& x, const std::vector<int64_t>& axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ReduceInferMeta
     param: [x, axis, keep_dim, dtype]
-  kernel : 
+  kernel :
     func : sum
     param : [x, axis, keep_dim, dtype]
     data_type : x
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index c99473158524637de112289e58182cd14bea60fc..6bb02ab9d40dbe28b01bf669417a8d521c6458da 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,6 +16,8 @@ import os
 import yaml
 import argparse
 
+import gen_utils
+
 
 class API:
     prefix_tensor_name = 'dense_'
@@ -23,12 +25,12 @@ class API:
     def __init__(self, api_item_yaml):
         self.api = api_item_yaml['api']
         # args:
-        #   inputs: 
+        #   inputs:
         #     names : [], list of input names
         #   attrs:
         #     names : [], list of attribute names
-        #     attr_info : { attr_name : (type, default_values)}    
-        self.args = self.parse_args(api_item_yaml['args'])
+        #     attr_info : { attr_name : (type, default_values)}
+        self.args = gen_utils.parse_args(self.api, api_item_yaml['args'])
         self.output = api_item_yaml['output']
         self.is_base_api = True
         if 'invoke' in api_item_yaml:
@@ -50,271 +52,29 @@ class API:
             if 'param' not in self.infer_meta:
                 self.infer_meta['param'] = None
 
-    def parse_args(self, args_str):
-        inputs = {'names': []}
-        attrs = {'names': [], 'attr_info': {}}
-        args_str = args_str.strip()
-        assert args_str.startswith('(') and args_str.endswith(')'), \
-            f"Args declaration should start with '(' and end with ')', please check the args of {self.api} in api.yaml."
-        args_str = args_str[1:-1]
-        args_list = args_str.split(',')
-        input_types = [
-            'const Tensor&', 'const Tensor &', 'const std::vector<Tensor>&',
-            'const std::vector<Tensor> &'
-        ]
-        attr_types = ['const Scalar&', 'const Scalar &', 'const ScalarArray&', 'const ScalarArray &', \
-                      'int', 'int32_t', 'int64_t', 'size_t', 'float', 'double', 'bool', \
-                      'const std::vector<int64_t>&', 'Backend', 'DataLayout', 'DataType']
-        args_declare_str = ""
-        args_define_str = ""
-        for item in args_list:
-            item = item.strip()
-            # match the input tensor
-            has_input = False
-            for in_type in input_types:
-                if item.startswith(in_type):
-                    input_name = item[len(in_type):].strip()
-                    assert len(input_name) > 0, \
-                        f"The input tensor name should not be empty. Please check the args of {self.api} in api.yaml."
-                    inputs['names'].append(input_name)
-                    args_declare_str = args_declare_str + in_type + ' ' + input_name + ', '
-                    args_define_str = args_define_str + in_type + ' ' + input_name + ', '
-                    has_input = True
-                    break
-            if has_input:
-                continue
-
-            # match the attribute
-            for attr_type in attr_types:
-                if item.startswith(attr_type):
-                    attr_name = item[len(attr_type):].strip()
-                    assert len(attr_name) > 0, \
-                        f"The attribute name should not be empty. Please check the args of {self.api} in api.yaml."
-                    default_value = None
-                    if '=' in attr_name:
-                        attr_infos = attr_name.split('=')
-                        attr_name = attr_infos[0].strip()
-                        default_value = attr_infos[1].strip()
-
-                    default_value_str = "" if default_value is None else '=' + default_value
-                    args_declare_str = args_declare_str + attr_type + ' ' + attr_name + default_value_str + ', '
-                    args_define_str = args_define_str + attr_type + ' ' + attr_name + ', '
-                    attrs['names'].append(attr_name)
-                    attrs['attr_info'][attr_name] = (attr_type, default_value)
-                    break
-
-        args = {
-            'inputs': inputs,
-            'attrs': attrs,
-            'args_declare': args_declare_str[:-2],
-            'args_define': args_define_str[:-2]
-        }
-        return args
-
     def gene_api_declaration(self):
         return f"""
 PADDLE_API {self.output} {self.api}({self.args['args_declare']});
 """
 
-    def gene_kernel_select(self, input_names, attrs, kernel):
-
-        kernel_key_item_init = """
-  Backend kernel_backend = Backend::UNDEFINED;
-  DataLayout kernel_layout = DataLayout::UNDEFINED;
-  DataType kernel_data_type = DataType::UNDEFINED;
-"""
-        # Check the tensor options
-        attr_backend_count = 0
-        attr_layout_count = 0
-        attr_data_type_count = 0
-        for attr_name in attrs['names']:
-            if attrs['attr_info'][attr_name][0] == 'Backend':
-                assert kernel['backend'] is not None, \
-                    f"{self.api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually."
-                attr_backend_count = attr_backend_count + 1
-            if attrs['attr_info'][attr_name][0] == 'DataLayout':
-                assert kernel['layout'] is not None, \
-                    f"{self.api} api: When there is a parameter with 'DataLayout' type in attributes, you must set layout of kernel manually."
-                attr_layout_count = attr_layout_count + 1
-            if attrs['attr_info'][attr_name][0] == 'DataType':
-                assert kernel['data_type'] is not None, \
-                    f"{self.api} api: When there is a parameter with 'DataType' type in attributes, you must set data_type of kernel manually."
-                attr_data_type_count = attr_data_type_count + 1
-
-        # preprocess kernel configures
-        kernel_select_code = ""
-        if kernel['backend'] is not None:
-            if '>' in kernel['backend']:
-                vars_list = kernel['backend'].split('>')
-                assert len(
-                    vars_list
-                ) == 2, f"{self.api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}."
-                assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Backend'), \
-                    f"{self.api} api: When use '>' to set kernel backend, the first param should be a attribute with Backend type."
-                kernel_select_code = kernel_select_code + f"""
-  kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
-"""
-
-            else:
-                args_str = ""
-                for ele in kernel['backend'].split(','):
-                    args_str = args_str + ele.strip() + ', '
-                kernel_select_code = kernel_select_code + f"""
-  kernel_backend = ParseBackend({args_str[:-2]});
-"""
-
-        if kernel['layout'] is not None:
-            if '>' in kernel['layout']:
-                vars_list = kernel['layout'].split('>')
-                assert len(
-                    vars_list
-                ) == 2, f"{self.api} api: The number of params to set layout with '>' only allows 2, but received {len(vars_list)}."
-                assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataLayout', \
-                    f"{self.api} api: When use '>' to set kernel layout, the first param should be a attribute with DataLayout type."
-                kernel_select_code = kernel_select_code + f"""
-  kernel_layout = ParseLayoutWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
-"""
-
-            else:
-                vars_list = kernel['layout'].split(',')
-                assert len(
-                    vars_list
-                ) == 1, f"{self.api} api: The number of params to set layout must be 1, but received {len(vars_list)}."
-                kernel_select_code = kernel_select_code + f"""
-  kernel_layout = ParseLayout({vars_list[0].strip()});
-"""
-
-        if kernel['data_type'] is not None:
-            if '>' in kernel['data_type']:
-                vars_list = kernel['data_type'].split('>')
-                assert len(
-                    vars_list
-                ) == 2, f"{self.api} api: The number of params to set data_type with '>' only allows 2, but received {len(vars_list)}."
-                assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataType', \
-                    f"{self.api} api: When use '>' to set kernel data_type, the first param should be a attribute with DataType type."
-                kernel_select_code = kernel_select_code + f"""
-  kernel_data_type = ParseDataTypeWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
-"""
-
-            else:
-                vars_list = kernel['data_type'].split(',')
-                assert len(
-                    vars_list
-                ) == 1, f"{self.api} api: The number of params to set data_type only allows 2, but received {len(vars_list)}."
-                kernel_select_code = kernel_select_code + f"""
-  kernel_data_type = ParseDataType({vars_list[0].strip()});
-"""
-
-        if len(input_names) == 0:
-            assert attr_backend_count > 0 and attr_layout_count > 0 and attr_data_type_count > 0, \
-                f"{self.api} api: When there is no input tensor, the args must have 'Backend', 'DataLayout' and 'DataType'."
-
-        kernel_select_args = ""
-        for input_name in input_names:
-            kernel_select_args = kernel_select_args + input_name + ", "
-
-        if len(kernel_select_args) > 2:
-            kernel_select_args = kernel_select_args[:-2]
-
-        kernel_select_code = kernel_key_item_init + kernel_select_code
-
-        if len(input_names) > 0:
-            kernel_select_code = kernel_select_code + f"""
-  if (kernel_backend == Backend::UNDEFINED 
-        || kernel_layout == DataLayout::UNDEFINED
-        || kernel_data_type == DataType::UNDEFINED ) {{
-    auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args});
-    auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-    if (kernel_backend == Backend::UNDEFINED) {{
-      kernel_backend = kernel_key.backend();
-    }}
-    if (kernel_layout == DataLayout::UNDEFINED) {{
-      kernel_layout = kernel_key.layout();
-    }}
-    if (kernel_data_type == DataType::UNDEFINED) {{
-      kernel_data_type = kernel_key.dtype();
-    }}
-  }}"""
-
-        kernel_select_code = kernel_select_code + f"""
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "{kernel['func']}", {{kernel_backend, kernel_layout, kernel_data_type}});
-  VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
-  VLOG(6) << "{self.api} API kernel: " << kernel;"""
-
-        return kernel_select_code
-
-    def gene_infer_meta(self, input_names, attr_names, infer_meta) -> str:
-        infer_meta_params = infer_meta['param'] if infer_meta[
-            'param'] is not None else input_names + attr_names
-        param_code = ""
-        for param in infer_meta_params:
-            if param in input_names:
-                param_code = param_code + "GetDenseTensorMeta(" + self.prefix_tensor_name + param + "), "
-            elif param in attr_names:
-                param_code = param_code + param + ", "
-            elif isinstance(param, str):
-                param_code = param_code + "\"" + param + "\", "
-            elif isinstance(param, bool):
-                param_code = param_code + str(param).lower() + ", "
-            else:
-                param_code = param_code + str(param) + ", "
-
-        param_code = param_code[:-2]
-        return f"""
-  auto out_meta = pten::{infer_meta['func']}({param_code});
-"""
-
-    def get_kernel_args(self, input_names, attrs, kernel_param):
-        input_tensor_code = ""
-        for input_name in input_names:
-            # set input code
-            input_tensor_code = input_tensor_code + f"""
-  auto {self.prefix_tensor_name}{input_name} = TensorToDenseTensor({input_name});"""
-
-        attr_names = attrs['names']
-        if kernel_param is None:
-            kernel_param = input_names + attr_names
-
-        kernel_args = "*dev_ctx, "
-        for param in kernel_param:
-            if param in input_names:
-                kernel_args = kernel_args + "*" + self.prefix_tensor_name + param + ", "
-            elif param in attr_names:
-                # set attr for kernel_context
-                if 'ScalarArray' in attrs['attr_info'][param][0]:
-                    param = 'pten::ScalarArray(' + param + ')'
-                elif 'Scalar' in attrs['attr_info'][param][0]:
-                    param = 'pten::Scalar(' + param + ')'
-                kernel_args = kernel_args + param + ", "
-            elif isinstance(param, bool):
-                kernel_args = kernel_args + str(param).lower() + ", "
-            else:
-                kernel_args = kernel_args + str(param) + ", "
-        return input_tensor_code, kernel_args[:-2]
-
     def gene_api_code(self):
         if self.is_base_api:
-            input_tensors, kernel_args = self.get_kernel_args(
+            input_tensors, kernel_args = gen_utils.get_kernel_args(
                 self.args['inputs']['names'], self.args['attrs'],
                 self.kernel['param'])
+            out_type, _ = gen_utils.parse_output(self.api, self.output)
+            outputs_args, output_create = gen_utils.gene_output(out_type)
             return f"""
 PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
-{self.gene_kernel_select(self.args['inputs']['names'], self.args['attrs'], self.kernel)}
+{gen_utils.gene_kernel_select(self.api, self.args['inputs']['names'], self.args['attrs'], self.kernel)}
 
   auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
 {input_tensors}
-{self.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], self.infer_meta)}
-  auto dense_out = std::make_shared<pten::DenseTensor>(
-        pten::make_intrusive<paddle::experimental::SharedStorage>(
-            pten::TransToFluidPlace(kernel_backend)),
-        std::move(out_meta));
-
-  Tensor out;
-  out.set_impl(dense_out);
+{gen_utils.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], self.infer_meta)}
+{output_create}
 
   auto* kernel_fn = kernel.GetVariadicKernelFn<pten::{self.api}_kernel>();
-  (*kernel_fn)({kernel_args}, dense_out.get());
+  (*kernel_fn)({kernel_args}, {outputs_args});
 
   return out;
 }}
@@ -330,6 +90,8 @@ PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
 
 def header_include():
     return """
+#include <tuple>
+
 #include "paddle/pten/api/include/tensor.h"
 #include "paddle/pten/common/scalar.h"
 #include "paddle/pten/common/scalar_array.h"
@@ -345,6 +107,7 @@ def source_include(header_file_path):
 
 #include "paddle/pten/api/include/kernel_signature.h"
 #include "paddle/pten/api/lib/api_registry.h"
+#include "paddle/pten/api/lib/api_utils.h"
 #include "paddle/pten/api/lib/kernel_dispatch.h"
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -358,9 +121,6 @@ def source_include(header_file_path):
 
 def api_register():
     return """
-PT_REGISTER_API(Creation);
-PT_REGISTER_API(Linalg);
-PT_REGISTER_API(Manipulation);
 PT_REGISTER_API(Math);
 """
 
@@ -377,35 +137,6 @@ namespace experimental {
 """)
 
 
-def tensor_to_densetensor():
-    return """
-  std::shared_ptr<pten::DenseTensor> TensorToDenseTensor(const Tensor& tensor) {
-      return std::dynamic_pointer_cast<pten::DenseTensor>(tensor.impl());
-  }
-
-  std::shared_ptr<std::vector<pten::DenseTensor>> TensorToDenseTensor(const std::vector<Tensor>& tensors) {
-      std::vector<pten::DenseTensor> pt_tensors;
-
-      for(auto & t : tensors) {
-          pt_tensors.push_back(*std::dynamic_pointer_cast<pten::DenseTensor>(t.impl()));
-      }
-      return std::make_shared<std::vector<pten::DenseTensor>>(pt_tensors);
-  }
-
-   const pten::DenseTensorMeta GetDenseTensorMeta(const std::shared_ptr<pten::DenseTensor> & x) {
-       return x->meta();
-   }
-
-   const std::vector<pten::DenseTensorMeta> GetDenseTensorMeta(const std::shared_ptr<std::vector<pten::DenseTensor>>& x) {
-       std::vector<pten::DenseTensorMeta> metas;
-       for(auto& t : *x) {
-           metas.push_back(t.meta());
-       }
-       return metas;
-   }
-"""
-
-
 def generate_api(api_yaml_path, header_file_path, source_file_path):
 
     with open(api_yaml_path, 'r') as f:
@@ -422,7 +153,6 @@ def generate_api(api_yaml_path, header_file_path, source_file_path):
     include_header_file = "paddle/pten/api/include/api.h"
     source_file.write(source_include(include_header_file))
     source_file.write(namespace[0])
-    source_file.write(tensor_to_densetensor())
 
     for api in apis:
         api_code = API(api)
@@ -443,7 +173,7 @@ def main():
         description='Generate PaddlePaddle C++ API files')
     parser.add_argument(
         '--api_yaml_path',
-        help='path to yaml file directory',
+        help='path to api yaml file',
         default='python/paddle/utils/code_gen/api.yaml')
     parser.add_argument(
         '--api_header_path',
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..26da7ae2adfaceaffe90aa203ec78bd0edb14b61
--- /dev/null
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -0,0 +1,34 @@
+- backward_api : matmul_grad
+  forward : matmul (const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y) -> Tensor(out)
+  args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x=false, bool transpose_y=false)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : MatmulGradInferMeta
+  kernel :
+    func : matmul_grad
+
+- backward_api : scale_grad
+  forward : scale (const Tensor& x, const Scalar& scale, float bias, bool bias_after_scale) -> Tensor(out)
+  args : (const Tensor& out_grad, const Scalar& scale, float bias=0.0, bool bias_after_scale=true)
+  output : Tensor(x_grad)
+  invoke : scale(out_grad, scale, bias, bias_after_scale)
+
+# TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future.
+#
+# - backward_api : matmul_double_grad
+#   forward : matmul_grad (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x, bool transpose_y) -> tuple<Tensor, Tensor>(dx, dy)
+#   args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, bool transpose_x, bool transpose_y)
+#   output : tuple<Tensor, Tensor, Tensor>  // d2x, d2y, dout_grad
+#   infer_meta :
+#     func : MatmulDoubleGradInferMeta
+#   kernel :
+#     func : matmul_double_grad
+
+# - backward_api : matmul_triple_grad
+#   forward : matmul_double_grad (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, bool transpose_x, bool transpose_y) -> tuple<Tensor, Tensor, Tensor>(d2x, d2y, dout_grad)
+#   args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, const Tensor& d2x_grad, const Tensor& d2y_grad, const Tensor& dout_grad_grad, bool transpose_x, bool transpose_y)
+#   output : tuple<Tensor, Tensor, Tensor, Tensor, Tensor>  // d3x, d3y, d2out_grad, ddx_grad, ddy_grad
+#   infer_meta :
+#     func : MatmulTripleGradInferMeta
+#   kernel :
+#     func : matmul_triple_grad
diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cb14327f6e09092bbce0229ae26f1b456238802
--- /dev/null
+++ b/python/paddle/utils/code_gen/backward_api_gen.py
@@ -0,0 +1,251 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import yaml
+import argparse
+import re
+
+import gen_utils
+
+
+class BackwardAPI:
+    """Generates the C++ declaration and definition of one backward API."""
+
+    def __init__(self, backward_item_yaml):
+        self.backward_api = backward_item_yaml['backward_api']
+        self.args, self.output_type, self.return_comment = self.parse_and_check_args(
+            backward_item_yaml['forward'], backward_item_yaml['args'],
+            backward_item_yaml['output'])
+
+        # 'invoke' entries delegate to another API; others need kernel configs.
+        self.is_base_api = True
+        if 'invoke' in backward_item_yaml:
+            self.is_base_api = False
+            self.invoke = backward_item_yaml['invoke']
+        else:
+            self.kernel = backward_item_yaml['kernel']
+            # Missing or empty optional kernel settings are normalized to None.
+            for key in ('backend', 'layout', 'data_type', 'param'):
+                if key not in self.kernel or len(self.kernel[key]) == 0:
+                    self.kernel[key] = None
+
+            self.infer_meta = backward_item_yaml['infer_meta']
+            if 'param' not in self.infer_meta or len(self.infer_meta[
+                    'param']) == 0:
+                self.infer_meta['param'] = None
+
+    def parse_forward_config(self, forward_config):
+        """Parse `forward` into (api name, inputs, attrs, output names)."""
+        # api_name (const Tensor& input, ... , int attr, ...) -> Tensor(out)
+        result = re.search(
+            r"(?P<api>[a-z][a-z0-9_]+)\s*(?P<args>\([^\)]+\))\s*->[^\(]*\((?P<outputs>[^\)]+)\)",
+            forward_config)
+        assert result is not None, \
+            f"{self.backward_api} : The forward config should be like 'api_name (args) -> Tensor(out)', but received '{forward_config}'. Please check the forward of {self.backward_api} in yaml."
+        api = result.group('api')
+        outputs = [item.strip() for item in result.group('outputs').split(',')]
+        forward_args = gen_utils.parse_args(api, result.group('args'))
+        return api, forward_args['inputs'], forward_args['attrs'], outputs
+
+    def parse_and_check_args(self, forward_config, args_config, output_config):
+        """Parse the backward args/output and check them against forward."""
+        _, fw_inputs, fw_attrs, fw_outputs = self.parse_forward_config(
+            forward_config)
+        bw_args = gen_utils.parse_args(self.backward_api, args_config)
+
+        # check the inputs of backward (`fw_inputs` is a dict: use 'names')
+        for name in bw_args['inputs']['names']:
+            if name not in fw_inputs['names'] and name not in fw_outputs:
+                if name.endswith('_grad'):
+                    original_name = name[:-5]
+                    assert original_name in fw_outputs, \
+                        f"{self.backward_api} : Input Tensor error: the input tensor({name}) of backward should be an input or output or grad of output in forward api. \
+                         Please check the forward of {self.backward_api} in yaml."
+
+        # check the attributes of backward
+        for attr in bw_args['attrs']['names']:
+            assert attr in fw_attrs['names'] and bw_args['attrs']['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0], \
+                f"{self.backward_api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api. \
+                 Please check the args of {self.backward_api} in yaml."
+
+        # check the output of backward
+        output_type, return_comment = gen_utils.parse_output(self.backward_api,
+                                                             output_config)
+        assert output_type.count('Tensor') <= len(fw_inputs['names']), \
+            f"{self.backward_api} : Output error: The number of outputs should not be greater than the number of inputs of forward api. \
+             Please check the output of {self.backward_api} in yaml."
+
+        return bw_args, output_type, return_comment
+
+    def gene_api_declaration(self):
+        """Return the C++ declaration, preceded by the return comment if any."""
+        if self.return_comment:
+            return f"""
+// {self.return_comment}
+{self.output_type} {self.backward_api}({self.args['args_declare']});
+"""
+        else:
+            return f"""
+{self.output_type} {self.backward_api}({self.args['args_declare']});
+"""
+
+    def gene_api_code(self):
+        """Return the C++ definition text of this backward API."""
+        if self.is_base_api:
+            input_tensors, kernel_args = gen_utils.get_kernel_args(
+                self.args['inputs']['names'], self.args['attrs'],
+                self.kernel['param'])
+            outputs_args, output_create = gen_utils.gene_output(
+                self.output_type)
+            return f"""
+// {self.return_comment}
+{self.output_type} {self.backward_api}({self.args["args_define"]}) {{
+{gen_utils.gene_kernel_select(self.backward_api, self.args['inputs']['names'], self.args['attrs'], self.kernel)}
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
+{input_tensors}
+{gen_utils.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], self.infer_meta)}
+{output_create}
+
+  auto* kernel_fn = kernel.GetVariadicKernelFn<pten::{self.backward_api}_kernel>();
+  (*kernel_fn)({kernel_args}, {outputs_args});
+
+  return out;
+}}
+"""
+        else:
+            invoke_func_name = self.invoke.split('(')[0].strip()
+            if invoke_func_name in self.args['attrs']['names']:
+                # Rename the param whose name is the same as the invoked api.
+                # The lookahead also matches the name at the end of the string.
+                pattern = r'\W' + invoke_func_name + r'(?![A-Za-z0-9_(])'
+
+                def adjust_name(matched):
+                    return matched.group() + '_val'
+
+                invoke_code = re.sub(pattern, adjust_name, self.invoke)
+                params_code = re.sub(pattern, adjust_name,
+                                     self.args["args_define"])
+            else:
+                invoke_code = self.invoke
+                params_code = self.args["args_define"]
+            return f"""
+// {self.return_comment}
+{self.output_type} {self.backward_api}({params_code}) {{
+  return {invoke_code};
+}}
+"""
+
+
+def header_include():
+    return """
+#include <tuple>
+
+#include "paddle/pten/api/include/tensor.h"
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/common/scalar_array.h"
+"""
+
+
+def source_include(header_file_path):
+    return f"""
+#include "{header_file_path}"
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/pten/api/include/kernel_signature.h"
+#include "paddle/pten/api/lib/api_registry.h"
+#include "paddle/pten/api/lib/api_utils.h"
+#include "paddle/pten/api/lib/kernel_dispatch.h"
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/api/include/api.h"
+#include "paddle/pten/infermeta/backward.h"
+"""
+
+
+def backward_api_namespace():
+    return ("""
+namespace paddle {
+namespace experimental {
+
+""", """
+
+}  // namespace experimental
+}  // namespace paddle
+""")
+
+
+def generate_backward_api(backward_yaml_path, header_file_path,
+                          source_file_path):
+    """Generate the backward API header and source from the backward yaml.
+
+    Args:
+        backward_yaml_path: path of the yaml file listing the backward apis.
+        header_file_path: path of the C++ header file to write.
+        source_file_path: path of the C++ source file to write.
+    """
+    with open(backward_yaml_path, 'r') as f:
+        bw_apis = yaml.load(f, Loader=yaml.FullLoader)
+
+    namespace = backward_api_namespace()
+    include_header_file = "paddle/pten/api/backward/backward_api.h"
+    # The `with` block closes both files even if code generation raises.
+    with open(header_file_path, 'w') as header_file, \
+            open(source_file_path, 'w') as source_file:
+        header_file.write("#pragma once\n")
+        header_file.write(header_include())
+        header_file.write(namespace[0])
+        source_file.write(source_include(include_header_file))
+        source_file.write(namespace[0])
+        for bw_api_item in bw_apis:
+            bw_api = BackwardAPI(bw_api_item)
+            header_file.write(bw_api.gene_api_declaration())
+            source_file.write(bw_api.gene_api_code())
+
+        header_file.write(namespace[1])
+        source_file.write(namespace[1])
+
+
+def main():
+    """Parse the command-line paths and generate the backward API files."""
+    parser = argparse.ArgumentParser(
+        description='Generate PaddlePaddle C++ backward API files')
+    # (flag, help text, default value) of every supported option.
+    option_specs = [
+        ('--backward_yaml_path', 'path to backward yaml file',
+         'python/paddle/utils/code_gen/backward.yaml'),
+        ('--backward_header_path',
+         'output of generated backward header code file',
+         'paddle/pten/api/backward/backward_api.h'),
+        ('--backward_source_path',
+         'output of generated backward source code file',
+         'paddle/pten/api/lib/backward_api.cc'),
+    ]
+    for flag, help_text, default_value in option_specs:
+        parser.add_argument(flag, help=help_text, default=default_value)
+
+    options = parser.parse_args()
+
+    # Hand the three resolved paths straight to the generator.
+    generate_backward_api(
+        options.backward_yaml_path,
+        options.backward_header_path,
+        options.backward_source_path)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/utils/code_gen/gen_utils.py b/python/paddle/utils/code_gen/gen_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d368c292b7cfefb0121aba9f0c0fcdc7b0a4caf
--- /dev/null
+++ b/python/paddle/utils/code_gen/gen_utils.py
@@ -0,0 +1,338 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+PREFIX_TENSOR_NAME = 'dense_'  # prefix of the generated C++ locals that hold TensorToDenseTensor(...) results
+
+
+def _match_arg_type(item, candidates):
+    """Return the first type of `candidates` that `item` declares, or None."""
+    for cand in candidates:
+        # A type ending in a word character must not run into another one,
+        # otherwise 'int' would claim the declaration 'int64_t x'.
+        glued = re.match(r'\w', cand[-1]) and re.match(r'\w', item[len(cand):])
+        if item.startswith(cand) and not glued:
+            return cand
+    return None
+
+
+def parse_args(api_name, args_str):
+    """
+    Parse the args declaration of an api in yaml, e.g. "(const Tensor& x, int axis=0)".
+    An AssertionError reports missing parentheses, an empty name or an input placed after an attribute.
+
+    Returns:
+       { inputs : {
+             names : [] // list of input names
+             input_info : { input_name : type }
+         }
+         attrs: {
+             names : [] // list of attribute names
+             attr_info : { attr_name : (type, default_value)}
+         }
+         args_declare : "str" // str of function params with default value. Example: (..., bool flag=false)
+         args_define : "str" // str of function params without default value. Example: (..., bool flag)
+       }
+    """
+    input_types = [
+        'const Tensor&', 'const Tensor &', 'const std::vector<Tensor>&',
+        'const std::vector<Tensor> &'
+    ]
+    attr_types = ['const Scalar&', 'const Scalar &', 'const ScalarArray&', 'const ScalarArray &', \
+                  'int', 'int32_t', 'int64_t', 'size_t', 'float', 'double', 'bool', \
+                  'const std::vector<int64_t>&', 'Backend', 'DataLayout', 'DataType']
+    inputs = {'names': [], 'input_info': {}}
+    attrs = {'names': [], 'attr_info': {}}
+    args_str = args_str.strip()
+    assert args_str.startswith('(') and args_str.endswith(')'), \
+        f"Args declaration should start with '(' and end with ')', please check the args of {api_name} in yaml."
+    declare_items, define_items = [], []
+
+    for item in args_str[1:-1].split(','):
+        item = item.strip()
+        in_type = _match_arg_type(item, input_types)
+        if in_type is not None:
+            input_name = item[len(in_type):].strip()
+            assert len(input_name) > 0, \
+                f"The input tensor name should not be empty. Please check the args of {api_name} in yaml."
+            assert len(attrs['names']) == 0, \
+                f"The input Tensor should appear before attributes. please check the position of {api_name}:input({input_name}) in yaml"
+            inputs['names'].append(input_name)
+            inputs['input_info'][input_name] = in_type
+            declare_items.append(in_type + ' ' + input_name)
+            define_items.append(in_type + ' ' + input_name)
+            continue
+
+        attr_type = _match_arg_type(item, attr_types)
+        if attr_type is None:
+            # NOTE(review): an item of unknown type (or the empty item of "()") is skipped silently, as before.
+            continue
+        attr_name = item[len(attr_type):].strip()
+        assert len(attr_name) > 0, \
+            f"The attribute name should not be empty. Please check the args of {api_name} in yaml."
+        default_value = None
+        if '=' in attr_name:
+            attr_infos = attr_name.split('=')
+            attr_name = attr_infos[0].strip()
+            default_value = attr_infos[1].strip()
+        default_value_str = "" if default_value is None else '=' + default_value
+        declare_items.append(attr_type + ' ' + attr_name + default_value_str)
+        define_items.append(attr_type + ' ' + attr_name)
+        attrs['names'].append(attr_name)
+        attrs['attr_info'][attr_name] = (attr_type, default_value)
+
+    return {'inputs': inputs, 'attrs': attrs,
+            'args_declare': ', '.join(declare_items),
+            'args_define': ', '.join(define_items)}
+
+
+def parse_output(api_name, output_config):
+    """
+    Parse the `output` config of an api in yaml: comma-separated `Type` or
+    `Type(name)` items, Type being Tensor or std::vector<Tensor>.
+
+    Returns:
+        (type, name) for a single item ('out' when the name is omitted), or
+        ("std::tuple<...>" of all types, names joined by ", ") for several.
+    """
+    def parse_output_item(output_item):
+        allowed_output_types = ['Tensor', 'std::vector<Tensor>']
+        if re.search(r'\(\w*\)', output_item):
+            # ':' must be part of the type, or std::vector<Tensor>(name) is
+            # read as the unsupported type 'vector<Tensor>'.
+            result = re.search(
+                r"(?P<out_type>[a-zA-Z0-9_<>:]+)\s*\((?P<name>\w+)\)",
+                output_item)
+            out_type = result.group('out_type')
+            assert out_type in allowed_output_types, \
+                f"{api_name} : Output type error: the output type only support Tensor and std::vector<Tensor>, \
+                  but now is {out_type}."
+
+            return out_type, result.group('name')
+
+        out_type = output_item.strip()
+        if out_type not in allowed_output_types:
+            # Report the offending text; `out_type` used to be unbound here.
+            raise ValueError(
+                "{} : Output type error: the output type only support Tensor and std::vector<Tensor>, \
+                  but now is {}.".format(api_name, out_type))
+        return out_type, 'out'
+
+    parsed = [parse_output_item(item) for item in output_config.split(',')]
+    if len(parsed) == 1:
+        return parsed[0]
+    return ("std::tuple<" + ",".join(t for t, _ in parsed) + ">",
+            ", ".join(name for _, name in parsed))
+
+
+def gene_kernel_select(api, input_names, attrs, kernel) -> str:
+    """
+    Generate the C++ code that selects the kernel of an api.
+
+    Args:
+        api: name of the api, used in error messages and in the generated VLOG lines.
+        input_names: names of the input tensors.
+        attrs: dict with 'names' and 'attr_info' (attr_name -> tuple whose first item is the type).
+        kernel: dict with 'func' (kernel name) and 'backend', 'layout', 'data_type';
+            each of the last three is None or a string: "a > b" selects the
+            Parse*WithInputOrder helper, otherwise the text holds the Parse* arguments.
+
+    Returns:
+        The generated C++ code; it defines `kernel_backend`, `kernel_layout`,
+        `kernel_data_type` and `kernel`.
+    """
+    kernel_key_item_init = """
+  Backend kernel_backend = Backend::UNDEFINED;
+  DataLayout kernel_layout = DataLayout::UNDEFINED;
+  DataType kernel_data_type = DataType::UNDEFINED;
+"""
+    # Check the tensor options
+    attr_backend_count = 0
+    attr_layout_count = 0
+    attr_data_type_count = 0
+    for attr_name in attrs['names']:
+        if attrs['attr_info'][attr_name][0] == 'Backend':
+            assert kernel['backend'] is not None, \
+                f"{api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually."
+            attr_backend_count += 1
+        if attrs['attr_info'][attr_name][0] == 'DataLayout':
+            assert kernel['layout'] is not None, \
+                f"{api} api: When there is a parameter with 'DataLayout' type in attributes, you must set layout of kernel manually."
+            attr_layout_count += 1
+        if attrs['attr_info'][attr_name][0] == 'DataType':
+            assert kernel['data_type'] is not None, \
+                f"{api} api: When there is a parameter with 'DataType' type in attributes, you must set data_type of kernel manually."
+            attr_data_type_count += 1
+
+    # preprocess kernel configures
+    kernel_select_code = ""
+    if kernel['backend'] is not None:
+        if '>' in kernel['backend']:
+            vars_list = kernel['backend'].split('>')
+            assert len(vars_list) == 2, \
+                f"{api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}."
+            assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Backend'), \
+                f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Backend type."
+            kernel_select_code = kernel_select_code + f"""
+  kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
+"""
+        else:
+            args_str = ", ".join(
+                ele.strip() for ele in kernel['backend'].split(','))
+            kernel_select_code = kernel_select_code + f"""
+  kernel_backend = ParseBackend({args_str});
+"""
+
+    if kernel['layout'] is not None:
+        if '>' in kernel['layout']:
+            vars_list = kernel['layout'].split('>')
+            assert len(vars_list) == 2, \
+                f"{api} api: The number of params to set layout with '>' only allows 2, but received {len(vars_list)}."
+            assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataLayout', \
+                f"{api} api: When use '>' to set kernel layout, the first param should be a attribute with DataLayout type."
+            kernel_select_code = kernel_select_code + f"""
+  kernel_layout = ParseLayoutWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
+"""
+        else:
+            vars_list = kernel['layout'].split(',')
+            assert len(vars_list) == 1, \
+                f"{api} api: The number of params to set layout must be 1, but received {len(vars_list)}."
+            kernel_select_code = kernel_select_code + f"""
+  kernel_layout = ParseLayout({vars_list[0].strip()});
+"""
+
+    if kernel['data_type'] is not None:
+        if '>' in kernel['data_type']:
+            vars_list = kernel['data_type'].split('>')
+            assert len(vars_list) == 2, \
+                f"{api} api: The number of params to set data_type with '>' only allows 2, but received {len(vars_list)}."
+            assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataType', \
+                f"{api} api: When use '>' to set kernel data_type, the first param should be a attribute with DataType type."
+            kernel_select_code = kernel_select_code + f"""
+  kernel_data_type = ParseDataTypeWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
+"""
+        else:
+            vars_list = kernel['data_type'].split(',')
+            assert len(vars_list) == 1, \
+                f"{api} api: The number of params to set data_type must be 1, but received {len(vars_list)}."
+            kernel_select_code = kernel_select_code + f"""
+  kernel_data_type = ParseDataType({vars_list[0].strip()});
+"""
+
+    if len(input_names) == 0:
+        assert attr_backend_count > 0 and attr_layout_count > 0 and attr_data_type_count > 0, \
+            f"{api} api: When there is no input tensor, the args must have 'Backend', 'DataLayout' and 'DataType'."
+
+    kernel_select_args = ", ".join(input_names)
+    kernel_select_code = kernel_key_item_init + kernel_select_code
+
+    # With inputs, the generated code fills the key items still UNDEFINED from the input tensors.
+    if len(input_names) > 0:
+        kernel_select_code = kernel_select_code + f"""
+  if (kernel_backend == Backend::UNDEFINED 
+        || kernel_layout == DataLayout::UNDEFINED
+        || kernel_data_type == DataType::UNDEFINED ) {{
+    auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args});
+    auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+    if (kernel_backend == Backend::UNDEFINED) {{
+      kernel_backend = kernel_key.backend();
+    }}
+    if (kernel_layout == DataLayout::UNDEFINED) {{
+      kernel_layout = kernel_key.layout();
+    }}
+    if (kernel_data_type == DataType::UNDEFINED) {{
+      kernel_data_type = kernel_key.dtype();
+    }}
+  }}"""
+
+    kernel_select_code = kernel_select_code + f"""
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "{kernel['func']}", {{kernel_backend, kernel_layout, kernel_data_type}});
+  VLOG(6) << "{api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
+  VLOG(6) << "{api} API kernel: " << kernel;"""
+
+    return kernel_select_code
+
+
+def gene_infer_meta(input_names, attr_names, infer_meta) -> str:
+    """Generate the C++ call of infer_meta['func'] whose result is stored in `out_meta`."""
+    def render(param):
+        if param in input_names:
+            return "GetDenseTensorMeta(*" + PREFIX_TENSOR_NAME + param + ")"
+        if param in attr_names:
+            return param
+        if isinstance(param, str):
+            return "\"" + param + "\""
+        # bool literals are lowercased (True -> true); other values use str().
+        return str(param).lower() if isinstance(param, bool) else str(param)
+
+    params = infer_meta['param']
+    if params is None:
+        # No explicit list: pass every input followed by every attribute.
+        params = input_names + attr_names
+    call_args = ", ".join(render(param) for param in params)
+    return f"""
+  auto out_meta = pten::{infer_meta['func']}({call_args});
+"""
+
+
+def get_kernel_args(input_names, attrs, kernel_param):
+    """Generate the dense-tensor declarations and the arguments of the kernel call.
+
+    Returns:
+        (input_tensor_code, kernel_args): the TensorToDenseTensor statements of
+        the inputs, and the kernel arguments, always starting with `*dev_ctx`.
+    """
+    attr_names = attrs['names']
+
+    def render(param):
+        if param in input_names:
+            return "*" + PREFIX_TENSOR_NAME + param
+        if param in attr_names:
+            attr_type = attrs['attr_info'][param][0]
+            # 'ScalarArray' is tested first since it contains 'Scalar'.
+            for wrapper in ('ScalarArray', 'Scalar'):
+                if wrapper in attr_type:
+                    return 'pten::' + wrapper + '(' + param + ')'
+            return param
+        return str(param).lower() if isinstance(param, bool) else str(param)
+
+    input_tensor_code = "".join(f"""
+  auto {PREFIX_TENSOR_NAME}{name} = TensorToDenseTensor({name});""" for name in input_names)
+    if kernel_param is None:
+        kernel_param = input_names + attr_names
+    kernel_args = ", ".join(["*dev_ctx"] + [render(p) for p in kernel_param])
+    return input_tensor_code, kernel_args
+
+
+def gene_output(output_type):
+    """Generate the C++ code that creates the api output.
+
+    Returns:
+        (kernel_output, output_create): the output arguments of the kernel
+        call and the statements declaring `out` and its dense outputs.
+    """
+    declare_out = f"""
+  {output_type} out;"""
+    if output_type in ('Tensor', 'std::vector<Tensor>'):
+        return 'dense_out', declare_out + """
+  auto dense_out = SetKernelOutput(out_meta, kernel_backend, &out);"""
+    if not re.match(r'std::tuple<.*>$', output_type):
+        return "", declare_out
+    # One dense output per 'Tensor' occurring in the tuple type.
+    indices = range(output_type.count('Tensor'))
+    setters = "".join(f"""
+  auto dense_out_{i} = SetKernelOutput(std::get<{i}>(out_meta), kernel_backend, &std::get<{i}>(out));""" for i in indices)
+    return ", ".join(f'dense_out_{i}' for i in indices), declare_out + setters