diff --git a/paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.cc b/paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.cc
index 1fed263394b492099fe47a907e08603a9ea2de5d..b6418ef30d50f20679b798b2054c9ee6f7b2aff1 100644
--- a/paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.h"
 
+#include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/phi/backends/onednn/onednn_reuse.h"
 #include "paddle/utils/string/pretty_log.h"
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index c4617138553917f7644006fb27cfcd7ddb00111b..5ee8b2c7efbb26db49a24c1d150af8ec42a78a7a 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -424,10 +424,10 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     T* lstm_x_data = lstm_x->mutable_data<T>(ctx.GetPlace());
     T* lstm_out_data = lstm_out->mutable_data<T>(ctx.GetPlace());
 
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
-
     // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1
     auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
+
     phi::funcs::FCFunctor<phi::CPUContext, T> fc;
     fc(dev_ctx,
        total_T,
diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h
index 7632482c97b3f8d02ca16064cba6eefcad102de3..2db3202b41abbf424e20bf3a0fa842fdd2e97d22 100644
--- a/paddle/fluid/operators/center_loss_op.h
+++ b/paddle/fluid/operators/center_loss_op.h
@@ -86,7 +86,7 @@ class CenterLossKernel : public framework::OpKernel<T> {
     int numel = centers_diffacc.numel();
     std::memset(centers_diffacc_data, 0, sizeof(T) * numel);
 
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
+    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     int tLabel;
 
     const T *x_index;
diff --git a/paddle/fluid/operators/fsp_op.h b/paddle/fluid/operators/fsp_op.h
index c5b903559a07b60c7f03cc3b36f53c771d2cf0fb..2136bc1933692e6e72a35b1099ce81339e4872d3 100644
--- a/paddle/fluid/operators/fsp_op.h
+++ b/paddle/fluid/operators/fsp_op.h
@@ -37,7 +37,8 @@ class FSPOpKernel : public framework::OpKernel<T> {
     auto height = x_dims[2];
     auto width = x_dims[3];
 
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
 
     phi::funcs::MatDescriptor x_mat_desc;
     x_mat_desc.height_ = x_channel;
@@ -81,7 +82,8 @@ class FSPGradOpKernel : public framework::OpKernel<T> {
     int64_t h = 0;
     int64_t w = 0;
 
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     phi::funcs::SetConstant<DeviceContext, T> set_zero;
     if (d_x != nullptr) {
       d_x->mutable_data<T>(context.GetPlace());
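Note: the operator hunks above, and most of those that follow, apply one mechanical refactor: phi::funcs::GetBlas is no longer fed a framework::ExecutionContext; the caller fetches the device context first and passes that. A minimal before/after sketch (DeviceContext and T stand for the enclosing kernel's template parameters; the angle-bracket arguments are reconstructed, since this listing lost them — the surviving overload in the blas.h hunk further down confirms the shape):

    // before: Blas built straight from the ExecutionContext
    auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);

    // after: fetch the device context once, build Blas from it
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);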
diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
index bec18220e9afdb949769e2dd7f025409b1f6857c..07207b6e028696a81bdb17b4774bb5e9be18768a 100644
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -411,7 +411,8 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
     T* xx_data = xx->mutable_data<T>(place);
     T* h_out_data = hidden_out->mutable_data<T>(place);
     T* c_out_data = cell_out->mutable_data<T>(place);
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
+    auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
 
     for (int64_t i = 0; i < ids_numel; ++i) {
       PADDLE_ENFORCE_LT(
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
index 7fdfe706ef638774d8964d31a68d9d08b6e1a233..8c86f9d5f471fde6d72a1ce08fa688488ca0848e 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -197,7 +197,9 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
       const int m = batch_size * idx_width;
       const int n = table_width;
       const int k = table_height;
-      auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(context);
+
+      auto &dev_ctx = context.template device_context<phi::CPUContext>();
+      auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
       blas.CSRMM(&transa,
                  &m,
                  &n,
@@ -316,7 +318,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
                         padding_idx);
 
     auto *d_output_data = d_output->data<T>();
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(context);
+    auto &dev_ctx = context.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
     int width = static_cast<int>(table_dim[1]);
     int num_seq = batch_size * idx_width;
     LOG(INFO) << "num seq = " << num_seq << " width = " << width;
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index 932a777d3e1ae0d96f6ad34c6fdbd7ab193848fc..55f60c500b97dc29331cb293a31c61f557227db0 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -310,9 +310,10 @@ class FusionGRUKernel : public framework::OpKernel<T> {
     const T* h0_data = h0 ? h0->data<T>() : nullptr;
     const T* wh_state_data = wh_data + D * D2;
     T* hidden_out_data = hidden_out->mutable_data<T>(place);
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
 
     auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
+
     phi::funcs::FCFunctor<phi::CPUContext, T> fc;
     fc(dev_ctx,
        total_T,
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc
index a1a0d490dd756ecd96fb68f0c708326fc9e50e5f..112a65a24eadd01b32739d25d2c38fb437d0532a 100644
--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc
@@ -377,9 +377,9 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
     T* xx_data = xx->mutable_data<T>(place);
     T* h_out_data = hidden_out->mutable_data<T>(place);
     T* c_out_data = cell_out->mutable_data<T>(place);
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
-
     auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
+
     phi::funcs::FCFunctor<phi::CPUContext, T> fc;
     fc(dev_ctx, total_T, D4, M, x_data, wx_data, xx_data, bias->data<T>());
diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
index 86eb7053f88e89a9996af6d844641294df1b638f..084692768367827273bded882622a93636cc0f01 100644
--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
@@ -239,9 +239,9 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(ctx.GetPlace());
     T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());
 
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
-
     auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
+
     phi::funcs::FCFunctor<phi::CPUContext, T> fc;
     fc(dev_ctx,
        total_T,
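Note: in these fused CPU kernels the dev_ctx lookup already existed for FCFunctor, so the change only moves the GetBlas call below it and reuses the same reference. A sketch of the resulting pattern (total_T, D4, M, x_data, wx_data, xx_data are taken from the fusion_lstm hunk; the template arguments are reconstructed):

    auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);

    // fully-connected projection shares the same device context
    phi::funcs::FCFunctor<phi::CPUContext, T> fc;
    fc(dev_ctx, total_T, D4, M, x_data, wx_data, xx_data, bias->data<T>());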
diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h
index d46e6cf429f6fcf1f1cb4cbb070736c0ab810fca..c22a82c5ae8fd62bb080780f9810cd9ef03a5531 100644
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
@@ -89,7 +89,8 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     const T* weight_data = weight->data<T>();
     T* gate_data = gate->data<T>();
     T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     blas.GEMM(false,
               false,
               batch_size,
@@ -251,7 +252,8 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
                           d_h * u);
     }
     // backward for reset_hidden_prev
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     blas.GEMM(false,
               true,
               batch_size,
diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h
index ad1542666fd39d8c853ef8e941974386c53d4bd3..b13d83a57ee97473d853be3b4e37d60ffbe659c9 100644
--- a/paddle/fluid/operators/index_select_op.h
+++ b/paddle/fluid/operators/index_select_op.h
@@ -119,7 +119,8 @@ struct IndexSelectAdd<
                   const T* src_pointer,
                   const T* p_pointer,
                   T* dist_pointer) {
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
+    auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
     blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer);
   }
 };
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 04153eecc392779bb17ff78f706de854195a44f8..b467428eeafd3e41405473d91cfc6b3c655dac6d 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -114,7 +114,9 @@ class LookupTableKernel : public framework::OpKernel<T> {
                      table + id_index * row_width,
                      row_width * sizeof(T));
             } else {
-              auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(context);
+              auto &dev_ctx =
+                  context.template device_context<phi::CPUContext>();
+              auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
               blas.VCOPY(row_width,
                          table + id_index * row_width,
                          output + i * row_width);
@@ -145,7 +147,9 @@
                      table + id_index * row_width,
                      row_width * sizeof(T));
             } else {
-              auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(context);
+              auto &dev_ctx =
+                  context.template device_context<phi::CPUContext>();
+              auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
               blas.VCOPY(row_width,
                          table + id_index * row_width,
                          output + i * row_width);
diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h
index f43fccb19e0b6f0d58dd75ba9eb108710164bff9..52c93f26b7e8a88603176368ea3a2b55819e3935 100644
--- a/paddle/fluid/operators/lookup_table_v2_op.h
+++ b/paddle/fluid/operators/lookup_table_v2_op.h
@@ -130,7 +130,8 @@ struct LookupTableV2CPUFunctor {
                    table + id_index * row_width,
                    row_width * sizeof(T));
         } else {
-          auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(context_);
+          auto &dev_ctx = context_.template device_context<phi::CPUContext>();
+          auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
           blas.VCOPY(row_width,
                      table + id_index * row_width,
                      output + i * row_width);
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index 5a6ed730477a1845ae7674c4de4aa0ed3082533e..96d5a115991b01f0aab214423ceeb0985b40e01f 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -45,9 +45,9 @@ struct LRNFunctor<phi::CPUContext, T> {
                   T beta,
                   const DataLayout data_layout) {
     auto place = ctx.GetPlace();
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
-    phi::funcs::Transpose<phi::CPUContext, T, 4> transpose;
     auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
+    phi::funcs::Transpose<phi::CPUContext, T, 4> transpose;
     phi::DenseTensor in_transpose, mid_transpose, out_transpose;
     // if channel_last, transpose to channel_first
     if (data_layout == DataLayout::kNHWC) {
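Note: in the lookup_table branches above, blas.VCOPY is only a typed row copy (the BLAS scopy/dcopy wrapper); it is functionally the memcpy that the sibling branch already performs. A sketch for one embedding row (row_width, table, output, i, id_index as in the hunks):

    // BLAS typed copy of one row, as the else-branch does:
    blas.VCOPY(row_width, table + id_index * row_width, output + i * row_width);
    // equivalent byte-wise copy, as the other branch does:
    std::memcpy(output + i * row_width,
                table + id_index * row_width,
                row_width * sizeof(T));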
diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc
index 3473a051b7324c36fec8b72b4316c835e892f57a..773d9f223f83442c0293ae89607ebf013a51780e 100644
--- a/paddle/fluid/operators/match_matrix_tensor_op.cc
+++ b/paddle/fluid/operators/match_matrix_tensor_op.cc
@@ -275,7 +275,8 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel<T> {
     memset(
         bottom_l_trans_data, 0.0, tmp->dims()[0] * tmp->dims()[1] * sizeof(T));
 
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
+    auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
 
     call_gemm(blas,
               CblasNoTrans,
@@ -297,7 +298,7 @@
           const auto* l_t_data =
               bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in;
           const auto* r_data = bottom_r_data + offset_r[b] * dim_in;
-          auto blas_2 = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
+          auto blas_2 = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
           call_gemm_with_lda(blas_2,
                              CblasNoTrans,
                              CblasTrans,
@@ -390,7 +391,8 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel<T> {
       }
     }
 
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
+    auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
 
     auto* t_data = w->data<T>();
     auto* d_w = ctx.Output<phi::DenseTensor>(framework::GradVarName("W"));
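Note: in CPUMatchMatrixTensorOPKernel the device context is now fetched once before the batch loop, and blas_2 inside the loop is built from that same dev_ctx. Constructing a BlasT from a device context is trivial (see the surviving GetBlas overload in the blas.h hunk below), so the per-iteration construction stays cheap. Sketch under that assumption:

    auto& dev_ctx = ctx.template device_context<phi::CPUContext>();  // once
    for (int b = 0; b < batch; ++b) {
      // cheap: wraps the reference, no context lookup inside the loop body
      auto blas_2 = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
      // call_gemm_with_lda(blas_2, CblasNoTrans, CblasTrans, ...);
    }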
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
index 770e33859a9c99c9a04d16747e7defa00ef146a8..b507f096082f943d652388f61e5cb771eadf3dbc 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
@@ -15,13 +15,18 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/selected_rows_functor.h"
 
 #include "gtest/gtest.h"
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
+#include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 TEST(selected_rows_functor, gpu_add) {
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CPUPlace cpu_place;
+  phi::GPUPlace gpu_place(0);
+  phi::CPUPlace cpu_place;
   phi::GPUContext& ctx = *reinterpret_cast<phi::GPUContext*>(
-      paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
+      phi::DeviceContextPool::Instance().Get(gpu_place));
   phi::funcs::SetConstant<phi::GPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -37,12 +42,12 @@ TEST(selected_rows_functor, gpu_add) {
 #ifdef PADDLE_WITH_HIP
   PADDLE_ENFORCE_EQ(hipDeviceSynchronize(),
                     0,
-                    paddle::platform::errors::PreconditionNotMet(
+                    phi::errors::PreconditionNotMet(
                         "The all synchronization on the cuda is error!"));
 #else
   PADDLE_ENFORCE_EQ(cudaDeviceSynchronize(),
                     0,
-                    paddle::platform::errors::PreconditionNotMet(
+                    phi::errors::PreconditionNotMet(
                         "The all synchronization on the cuda is error!"));
 #endif
@@ -80,8 +85,7 @@ TEST(selected_rows_functor, gpu_add) {
   EXPECT_EQ(out_rows[6], 9);
 
   phi::DenseTensor out_cpu;
-  paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
-  ctx.Wait();
+  phi::Copy(ctx, *out_value, cpu_place, true, &out_cpu);
   auto* out_cpu_data = out_cpu.data<float>();
 
   // input1 value
@@ -107,8 +111,7 @@ TEST(selected_rows_functor, gpu_add) {
   add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
 
   phi::DenseTensor tensor2_cpu;
-  paddle::framework::TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
-  ctx.Wait();
+  phi::Copy(ctx, *tensor2, cpu_place, true, &tensor2_cpu);
   auto* tensor2_cpu_data = tensor2_cpu.data<float>();
 
   // row0: 1.0 + 2.0 + 3.0
@@ -128,10 +131,10 @@
 }
 
 TEST(selected_rows_functor, gpu_add_to) {
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CPUPlace cpu_place;
+  phi::GPUPlace gpu_place(0);
+  phi::CPUPlace cpu_place;
   phi::GPUContext& ctx = *reinterpret_cast<phi::GPUContext*>(
-      paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
+      phi::DeviceContextPool::Instance().Get(gpu_place));
   phi::funcs::SetConstant<phi::GPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -181,8 +184,7 @@ TEST(selected_rows_functor, gpu_add_to) {
   EXPECT_EQ(out_rows[6], 9);
 
   phi::DenseTensor out_cpu;
-  paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
-  ctx.Wait();
+  phi::Copy(ctx, *out_value, cpu_place, true, &out_cpu);
   auto* out_cpu_data = out_cpu.data<float>();
 
   // input1 value
@@ -206,8 +208,7 @@ TEST(selected_rows_functor, gpu_add_to) {
   add_to_tensor_functor(ctx, *output, tensor1.get());
 
   phi::DenseTensor tensor1_cpu;
-  paddle::framework::TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
-  ctx.Wait();
+  phi::Copy(ctx, *tensor1, cpu_place, true, &tensor1_cpu);
   auto* tensor1_cpu_data = tensor1_cpu.data<float>();
 
   // row0: 1.0 + 2.0 + 3.0
@@ -227,10 +228,10 @@
 }
 
 TEST(selected_rows_functor, gpu_merge_add) {
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CPUPlace cpu_place;
+  phi::GPUPlace gpu_place(0);
+  phi::CPUPlace cpu_place;
   phi::GPUContext& ctx = *reinterpret_cast<phi::GPUContext*>(
-      paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
+      phi::DeviceContextPool::Instance().Get(gpu_place));
   phi::funcs::SetConstant<phi::GPUContext, float> set_const;
 
   int64_t height = 10;
@@ -264,8 +265,7 @@ TEST(selected_rows_functor, gpu_merge_add) {
   merge_add_functor(ctx, inputs, output.get());
 
   phi::DenseTensor output_cpu;
-  paddle::framework::TensorCopy(output->value(), cpu_place, ctx, &output_cpu);
-  ctx.Wait();
+  phi::Copy(ctx, output->value(), cpu_place, true, &output_cpu);
 
   EXPECT_EQ(output->height(), height);
   EXPECT_EQ(output->value().dims(), phi::make_ddim({3, row_numel}));
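Note: the test now uses phi::Copy with blocking=true, which folds the explicit ctx.Wait() into the copy itself; the host-side EXPECT_EQ checks that follow can read the buffer immediately. Sketch of the new idiom (signature per paddle/phi/core/tensor_utils.h, pulled in by the new include):

    phi::DenseTensor out_cpu;
    // synchronous device-to-host copy; no separate ctx.Wait() needed
    phi::Copy(ctx, *out_value, cpu_place, /*blocking=*/true, &out_cpu);
    const float* out_cpu_data = out_cpu.data<float>();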
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index bc5e5aa6ea1514b3c8fb50acd542179532bc5832..3036cdc5615ade227cc0d59e26b6d5d1be2defa9 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -69,7 +69,7 @@ class MatMulKernel : public framework::OpKernel<T> {
     auto &dev_ctx = context.template device_context<DeviceContext>();
     dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
 
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
+    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
         RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X"));
     auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(
@@ -237,7 +237,8 @@ class MatMulGradKernel : public framework::OpKernel<T> {
                  bool trans_b,
                  phi::DenseTensor *out) const {
     out->mutable_data<T>(context.GetPlace());
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a);
     auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b);
 
@@ -376,7 +377,8 @@ class MatMulDoubleGradKernel : public framework::OpKernel<T> {
                  bool flag,
                  phi::DenseTensor *out) const {
     out->mutable_data<T>(context.GetPlace());
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a);
     auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b);
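Note: MatMulKernel already had dev_ctx in scope for the Alloc call, so only the GetBlas argument changes there; the two grad kernels add the lookup. The descriptor-driven BLAS call itself is untouched. A compressed sketch of the forward path (RowMatrixFromVector and the attribute name come from the hunk; scale is the kernel's alpha attribute, and mat_dim_b is built the same way from y):

    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
    auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
        RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X"));
    blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0));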
diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h
index 15f87803f5ab8e7ccec5de9996a39b0f27384a19..e2156483320107742a3bb30eda221a24346a053c 100644
--- a/paddle/fluid/operators/search_compute.h
+++ b/paddle/fluid/operators/search_compute.h
@@ -61,7 +61,8 @@ void call_gemm(const framework::ExecutionContext& ctx,
                T* C) {
   int lda = (TransA == CblasNoTrans) ? K : M;
   int ldb = (TransB == CblasNoTrans) ? N : K;
-  auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
+  auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+  auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
   blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N);
 }
diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h
index 6358722e94390521a8919c0b69b4a7d867c1684f..86ef05df0e8f3ea83209eac116a8af30602e888d 100644
--- a/paddle/fluid/operators/svd_helper.h
+++ b/paddle/fluid/operators/svd_helper.h
@@ -698,7 +698,8 @@ struct DeviceIndependenceTensorOperations {
  private:
   const framework::ExecutionContext& context;
   phi::funcs::BlasT<DeviceContext, T> GetBlas() {
-    return phi::funcs::GetBlas<DeviceContext, T>(context);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    return phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
   }
   platform::ForRange<DeviceContext> GetForRange(int numel) {
     auto& dev_ctx = context.template device_context<DeviceContext>();
diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc
index b470874f26083682d637a7cb37e171ec7575de66..f60190f00cb5561ac2a2ad2cd1b36735bb382c66 100644
--- a/paddle/fluid/operators/var_conv_2d_op.cc
+++ b/paddle/fluid/operators/var_conv_2d_op.cc
@@ -326,7 +326,8 @@ class CPUVarConv2dOPKernel : public framework::OpKernel<T> {
     auto* w_data = w->data<T>();
     auto* col_data = col->data<T>();
 
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
+    auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
     for (int b = 0; b < batch; ++b) {
       int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel;
       if (top_im_size == 0) {
@@ -484,7 +485,8 @@ class CPUVarConv2dOPGradKernel : public framework::OpKernel<T> {
     int batch = x->lod()[0].size() - 1;
     const auto& top_offset = out->lod()[0];
     const auto& col_offset = col->lod()[0];
-    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
+    auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
+    auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx);
     for (int b = 0; b < batch; ++b) {
       int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel;
       if (top_im_size == 0) {
diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h
index ab8facad4de0519c89506d8e96bcf4ac13d09c59..850a754b25e7fc8a4685a7566aa789ddcd7d83f8 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_helper.h
+++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h
@@ -92,8 +92,7 @@ cudaDataType_t ToCudaDataType() {
   } else {
     PADDLE_THROW(phi::errors::InvalidArgument(
         "DataType %s is unsupported for CUDA.",
-        paddle::experimental::DataTypeToString(
-            paddle::experimental::CppTypeToDataType<T>::Type())));
+        DataTypeToString(paddle::experimental::CppTypeToDataType<T>::Type())));
   }
 }
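Note: dropping the paddle::experimental:: qualifier on DataTypeToString works because these call sites sit inside namespace phi, where the symbol is visible via paddle/phi/common/data_type.h; whether phi owns the function or re-exports it from paddle::experimental is an assumption about that header's layering, not something this diff shows. Minimal illustration of the shorter spelling:

    namespace phi {
    // unqualified call resolves inside namespace phi
    std::string Describe(DataType t) { return DataTypeToString(t); }
    }  // namespace phi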
diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc
index ee15408a949934a6d2c9231424bab14932147b53..5744c3b85dcaf8d41e054f26d14e90cf61d5b351 100644
--- a/paddle/phi/core/kernel_factory.cc
+++ b/paddle/phi/core/kernel_factory.cc
@@ -18,6 +18,7 @@
 #include "paddle/phi/core/enforce.h"
 #if defined(PADDLE_WITH_XPU)
 #include "paddle/phi/backends/xpu/xpu_op_list.h"
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #endif
 #if defined(PADDLE_WITH_CUSTOM_DEVICE)
@@ -134,8 +135,7 @@ bool KernelFactory::HasKernel(const std::string& kernel_name,
 }
 
 void KernelFactory::AddToLowPrecisionKernelList(
-    const std::string& name,
-    const paddle::experimental::DataType& kernel_key_type) {
+    const std::string& name, const phi::DataType& kernel_key_type) {
   if (FLAGS_low_precision_op_list >= 1) {
     auto op_name = phi::TransToFluidOpName(name);
     if (op_name.find("_grad") != std::string::npos) {
@@ -469,14 +469,13 @@ std::string KernelSelectionErrorMessage(const std::string& kernel_name,
       if (kernel_key.dtype() == target_key.dtype()) {
         support_dtype = true;
       }
-      dtype_set.insert(
-          paddle::experimental::DataTypeToString(kernel_key.dtype()));
+      dtype_set.insert(DataTypeToString(kernel_key.dtype()));
     }
     backend_set.insert(
         paddle::experimental::BackendToString(kernel_key.backend()));
     all_kernel_key[paddle::experimental::BackendToString(kernel_key.backend()) +
                    ", " + phi::DataLayoutToString(kernel_key.layout())]
-        .push_back(paddle::experimental::DataTypeToString(kernel_key.dtype()));
+        .push_back(DataTypeToString(kernel_key.dtype()));
   }
   // 1. If target_key not supports target backend, output "Selected wrong
   // Backend ..."
@@ -490,8 +489,7 @@ std::string KernelSelectionErrorMessage(const std::string& kernel_name,
   // DataType ..."
   if (!support_dtype) {
     std::string error_message = paddle::string::join_strings(dtype_set, ", ");
-    return "Selected wrong DataType `" +
-           paddle::experimental::DataTypeToString(target_key.dtype()) +
+    return "Selected wrong DataType `" + DataTypeToString(target_key.dtype()) +
            "`. Paddle support following DataTypes: " + error_message + ".";
   }
   // 3. `target_key` is still not supported, output all kernel keys of
diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h
index 8c954b7d0656447e159ef429faa9a2f0f5bf56ed..7aeef696759f735cd0f502293c333b0c573c5336 100644
--- a/paddle/phi/core/kernel_factory.h
+++ b/paddle/phi/core/kernel_factory.h
@@ -32,8 +32,6 @@
 #include "paddle/utils/small_vector.h"
 
 namespace phi {
-using DataType = paddle::experimental::DataType;
-
 struct OpCount {
   OpCount() {
     fp16_called_ = 0;
@@ -337,9 +335,8 @@ class KernelFactory {
   const KernelArgsDef& GetFirstKernelArgsDef(
       const std::string& kernel_name) const;
 
-  void AddToLowPrecisionKernelList(
-      const std::string& name,
-      const paddle::experimental::DataType& kernel_key_type);
+  void AddToLowPrecisionKernelList(const std::string& name,
+                                   const DataType& kernel_key_type);
 
   std::map<std::string, OpCount> GetLowPrecisionKernelList();
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index b2edfd5a2aef1ce3a592776abb53a89ef3a859cd..68e7f24cb9dc19b9e6aa012a57227143b773e057 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -141,9 +141,9 @@ void ArgMinMaxInferMeta(const MetaTensor& x,
       phi::errors::InvalidArgument(
           "The attribute of dtype in argmin/argmax must be [%s] or [%s], but "
           "received [%s]",
-          phi::DataTypeToString(DataType::INT32),
-          phi::DataTypeToString(DataType::INT64),
-          phi::DataTypeToString(phi::TransToPhiDataType(dtype))));
+          DataTypeToString(DataType::INT32),
+          DataTypeToString(DataType::INT64),
+          DataTypeToString(phi::TransToPhiDataType(dtype))));
 
   if (!config.is_runtime && axis.FromTensor()) {
     std::vector<int64_t> vec;
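Note: kernel_factory.h deletes its local `using DataType = paddle::experimental::DataType;` alias yet keeps spelling the parameter type DataType. That only compiles if the enum is already reachable as phi::DataType, which is what the new paddle/phi/common/data_type.h include in kernel_factory.cc provides. Minimal illustration of the dependency (Touch is a hypothetical name, not in the diff):

    #include "paddle/phi/common/data_type.h"

    namespace phi {
    // resolves to phi::DataType with no local alias required
    void Touch(const DataType& dtype) { (void)DataTypeToString(dtype); }
    }  // namespace phi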
diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc
index 808173e892e2d51bc09b93a3377a97d37af3145e..0ac688f48fa5b440b89fb97d5c2265cc8113a636 100644
--- a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc
@@ -81,9 +81,9 @@ void IndexSampleGradKernel(const Context& ctx,
       errors::InvalidArgument(
           "Input(Index) holds the wrong type, it holds %s, but "
           "desires to be %s or %s",
-          phi::DataTypeToString(index_type),
-          phi::DataTypeToString(DataType::INT32),
-          phi::DataTypeToString(DataType::INT64)));
+          DataTypeToString(index_type),
+          DataTypeToString(DataType::INT32),
+          DataTypeToString(DataType::INT64)));
   if (index_type == DataType::INT32) {
     IndexSampleGradInner<T, Context, int>(ctx, out_grad, index, x_grad);
   } else if (index_type == DataType::INT64) {
diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc
index de373928498982fffa6af679908591fd06a381b3..4ab51161350f29ec9d316b13f32f08a588e04737 100644
--- a/paddle/phi/kernels/cpu/index_sample_kernel.cc
+++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc
@@ -94,9 +94,9 @@ void IndexSampleKernel(const Context &ctx,
       errors::InvalidArgument(
           "Input(Index) holds the wrong type, it holds %s, but "
           "desires to be %s or %s",
-          phi::DataTypeToString(index_type),
-          phi::DataTypeToString(DataType::INT32),
-          phi::DataTypeToString(DataType::INT64)));
+          DataTypeToString(index_type),
+          DataTypeToString(DataType::INT32),
+          DataTypeToString(DataType::INT64)));
   if (index_type == DataType::INT32) {
     IndexSampleInner<T, Context, int>(ctx, x, index, out);
   } else if (index_type == DataType::INT64) {
diff --git a/paddle/phi/kernels/cpu/matrix_nms_kernel.cc b/paddle/phi/kernels/cpu/matrix_nms_kernel.cc
index 942b7fabf6af56f7431e6e7516c753aaadfafe97..283b4e0ac7d5b5054d7d51811238a372c0b417a1 100644
--- a/paddle/phi/kernels/cpu/matrix_nms_kernel.cc
+++ b/paddle/phi/kernels/cpu/matrix_nms_kernel.cc
@@ -15,6 +15,8 @@
 #include "paddle/phi/kernels/matrix_nms_kernel.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc
index 75875f81beee8bf82a5429b5214344f9317a911d..d97e9b5603797e209d8e643dba5c8d70394e5f69 100644
--- a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/phi/kernels/repeat_interleave_grad_kernel.h"
 
+#include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
@@ -54,9 +55,9 @@ void RepeatInterleaveWithTensorIndexGradKernel(
       phi::errors::InvalidArgument(
           "Input(Repeats) holds the wrong type, it holds %s, but "
           "desires to be %s or %s",
-          phi::DataTypeToString(index_type),
-          phi::DataTypeToString(phi::DataType::INT32),
-          phi::DataTypeToString(phi::DataType::INT64)));
+          DataTypeToString(index_type),
+          DataTypeToString(phi::DataType::INT32),
+          DataTypeToString(phi::DataType::INT64)));
   phi::DeviceContextPool::Instance().Get(repeats_tensor.place());
   if (index_type == phi::DataType::INT32) {
diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h
index a44c24e971a47ae7c8712262a1d5b93bfc2cefe9..9e970cf1b549a902697e968180d5d0d68ec20e6f 100644
--- a/paddle/phi/kernels/funcs/blas/blas.h
+++ b/paddle/phi/kernels/funcs/blas/blas.h
@@ -14,7 +14,6 @@
 
 #pragma once
 
-#include "paddle/fluid/framework/operator.h"
 #include "paddle/phi/core/dense_tensor.h"
 
 #ifdef PADDLE_WITH_MKLML
@@ -579,13 +578,6 @@ class BlasT : private Blas<DeviceContext> {
   }
 };
 
-template <typename DeviceContext, typename T>
-inline BlasT<DeviceContext, T> GetBlas(
-    const paddle::framework::ExecutionContext& exe_ctx) {
-  return BlasT<DeviceContext, T>(
-      exe_ctx.template device_context<DeviceContext>());
-}
-
 template <typename DeviceContext, typename T>
 inline BlasT<DeviceContext, T> GetBlas(const DeviceContext& dev_ctx) {
   return BlasT<DeviceContext, T>(dev_ctx);
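Note: this blas.h hunk is the point of the whole series. The ExecutionContext overload of GetBlas was the header's last dependency on fluid (hence the removed operator.h include); only the device-context overload survives, and it is trivial:

    template <typename DeviceContext, typename T>
    inline BlasT<DeviceContext, T> GetBlas(const DeviceContext& dev_ctx) {
      // just wraps the device context; no ExecutionContext lookup anymore
      return BlasT<DeviceContext, T>(dev_ctx);
    }

Call sites that only hold an ExecutionContext must now perform the device_context<DeviceContext>() lookup themselves, exactly as the operator hunks above do.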
diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cc b/paddle/phi/kernels/funcs/selected_rows_functor.cc
index b909f7e537d7ccedda9f49a40fa5ac6f1aa38eb9..a7e8b1b4fdb5ecb7d27e1e809888f5f36709150a 100644
--- a/paddle/phi/kernels/funcs/selected_rows_functor.cc
+++ b/paddle/phi/kernels/funcs/selected_rows_functor.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/funcs/selected_rows_functor.h"
 
+#include
+#include
+#include
+#include
+
+#include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/mixed_vector.h"
 
 #ifdef PADDLE_WITH_XPU
diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.h b/paddle/phi/kernels/funcs/selected_rows_functor.h
index 8fe55678195c67564bf8df946d13a7ded2aa99ee..38e68ee0ccfc613a6af7891131bc9ad02bb5f28d 100644
--- a/paddle/phi/kernels/funcs/selected_rows_functor.h
+++ b/paddle/phi/kernels/funcs/selected_rows_functor.h
@@ -17,6 +17,8 @@ limitations under the License. */
 #include
 
 #include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/mixed_vector.h"
+#include "paddle/phi/core/selected_rows.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h
index 913ee1afb9f4c7c0430b045658c045ccde43b63b..d704d2d60fa8d3745afeb242d26f8ffed491c827 100644
--- a/paddle/phi/kernels/funcs/unique_functor.h
+++ b/paddle/phi/kernels/funcs/unique_functor.h
@@ -84,9 +84,9 @@ struct UniqueOpFunctor {
         phi::errors::InvalidArgument(
             "Index holds the wrong type, it holds %s, "
             "but desires to be %s or %s",
-            phi::DataTypeToString(index_type),
-            phi::DataTypeToString(DataType::INT32),
-            phi::DataTypeToString(DataType::INT64)));
+            DataTypeToString(index_type),
+            DataTypeToString(DataType::INT32),
+            DataTypeToString(DataType::INT64)));
 
     if (index_type == DataType::INT32) {
       for (auto i = 0; i < in_->numel(); ++i) {
diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
index 9bf5181c1746ca303fe7a55fbd55d3fa0344203e..ee6602f0dacc2e916e40e2412ae1026733750aba 100755
--- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
@@ -75,9 +75,9 @@ void IndexSampleGradKernel(const Context& ctx,
       errors::InvalidArgument(
           "Input(Index) holds the wrong type, it holds %s, but "
           "desires to be %s or %s",
-          phi::DataTypeToString(index_type),
-          phi::DataTypeToString(DataType::INT32),
-          phi::DataTypeToString(DataType::INT64)));
+          DataTypeToString(index_type),
+          DataTypeToString(DataType::INT32),
+          DataTypeToString(DataType::INT64)));
   auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
   auto input_num = x.numel();
diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu
index 2ea5eaa9eaad1ca937a67d27483f7fdb6d4d363a..9175f783b8baf2bb9951b364dc858aefb932d658 100755
--- a/paddle/phi/kernels/gpu/index_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu
@@ -64,9 +64,9 @@ void IndexSampleKernel(const Context& ctx,
       errors::InvalidArgument(
           "Input(Index) holds the wrong type, it holds %s, but "
          "desires to be %s or %s",
-          phi::DataTypeToString(index_type),
-          phi::DataTypeToString(DataType::INT32),
-          phi::DataTypeToString(DataType::INT64)));
+          DataTypeToString(index_type),
+          DataTypeToString(DataType::INT32),
+          DataTypeToString(DataType::INT64)));
   const T* in_data = x.data<T>();
   T* out_data = ctx.template Alloc<T>(out);
   auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
diff --git a/paddle/phi/kernels/impl/lamb_kernel_impl.h b/paddle/phi/kernels/impl/lamb_kernel_impl.h
index d3a652c82dddb7caae8c547111f58d4e759d43fc..e4d81008fdecb2ceb9756989a6da7f5427409c58 100644
--- a/paddle/phi/kernels/impl/lamb_kernel_impl.h
+++ b/paddle/phi/kernels/impl/lamb_kernel_impl.h
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
 #pragma once
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/lamb_functors.h"
 
@@ -255,8 +256,8 @@ void ComputeImpl(const Context& dev_ctx,
     auto pn = phi::funcs::ToVector(p_norm_ptr, 1, dev_ctx.GetPlace());
     auto tn =
         phi::funcs::ToVector(trust_ratio_div_norm_ptr, 1, dev_ctx.GetPlace());
-    auto dtype = paddle::framework::DataTypeToString(
-        paddle::framework::DataTypeTrait<T>::DataType());
+    auto dtype =
+        DataTypeToString(paddle::experimental::CppTypeToDataType<T>::Type());
     VLOG(1) << "Param " << dtype << " " << name << " pn = " << pn[0]
             << " , tn = " << tn[0];
   }
diff --git a/paddle/phi/kernels/impl/multi_dot_kernel_impl.h b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h
index 039b056200fddfc239f724634d0307d8a793ad76..dfb3e04a8c33a26523a60374c4154d8ed220de55 100644
--- a/paddle/phi/kernels/impl/multi_dot_kernel_impl.h
+++ b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h
@@ -224,7 +224,6 @@ void MultiDotKernel(const Context& ctx,
     phi::DDim tmp_dim = phi::make_ddim({Ka, Nc});
     tmp_out.Resize(tmp_dim);
     ctx.template Alloc<T>(&tmp_out);
-    std::cout << tmp_out << std::endl;
     blas.MatMul(
         *ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, T(0));
     auto mat_dim_tmp = phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false);
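Note: the multi_dot hunk deletes a stray `std::cout << tmp_out << std::endl;`, which streamed an entire intermediate DenseTensor to stdout on every kernel invocation. If tracing is ever wanted here again, a guarded VLOG would be the idiomatic replacement (hypothetical, not part of the diff):

    VLOG(4) << "multi_dot tmp_out dims: " << tmp_out.dims();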
diff --git a/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h b/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h
index feb10d08d4c6c58fe72c45295e557fbd2e3831a8..806e2be66332cb0b9052de97abdf43d9c6dab791 100644
--- a/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/cpu/index_select_impl.h"
 #include "paddle/phi/kernels/repeat_interleave_grad_kernel.h"
@@ -91,22 +92,18 @@ void RepeatInterleaveWithTensorIndexGradKernel(
                        repeats_tensor.dims()[0],
                        x_grad->dims()[dim]));
 
-  const auto& index_type =
-      paddle::framework::TransToProtoVarType(repeats_tensor.dtype());
+  const auto& index_type = repeats_tensor.dtype();
   bool index_type_match =
-      index_type == paddle::framework::proto::VarType::INT32 ||
-      index_type == paddle::framework::proto::VarType::INT64;
+      index_type == DataType::INT32 || index_type == DataType::INT64;
   PADDLE_ENFORCE_EQ(index_type_match,
                     true,
                     phi::errors::InvalidArgument(
                         "Input(Repeats) holds the wrong type, it holds %s, but "
                         "desires to be %s or %s",
-                        paddle::framework::DataTypeToString(index_type),
-                        paddle::framework::DataTypeToString(
-                            paddle::framework::proto::VarType::INT32),
-                        paddle::framework::DataTypeToString(
-                            paddle::framework::proto::VarType::INT64)));
+                        DataTypeToString(index_type),
+                        DataTypeToString(DataType::INT32),
+                        DataTypeToString(DataType::INT64)));
 
 #if defined(__NVCC__) || defined(__HIPCC__)
   auto output_dim = out_grad.dims();
@@ -126,7 +123,7 @@ void RepeatInterleaveWithTensorIndexGradKernel(
                     0,
                     stream>>>(in_grad_data, numel);
 
-  if (index_type == paddle::framework::proto::VarType::INT64) {
+  if (index_type == DataType::INT64) {
     phi::funcs::RepeatsTensor2IndexTensor<Context, int64_t>(
         ctx, repeats_tensor, &index);
     int64_t index_nums = index.numel();
diff --git a/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h b/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h
index ff413c7b61a292f6ad950123c456a2fc14d74fce..b6050810640083bcec81c4d894f094c5335f2150 100644
--- a/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h
+++ b/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h
@@ -140,9 +140,9 @@ void RepeatInterleaveWithTensorIndexKernel(const Context& ctx,
         phi::errors::InvalidArgument(
             "Input(RepeatsTensor) holds the wrong type, it holds %s, but "
             "desires to be %s or %s",
-            phi::DataTypeToString(index_type),
-            phi::DataTypeToString(phi::DataType::INT32),
-            phi::DataTypeToString(phi::DataType::INT64)));
+            DataTypeToString(index_type),
+            DataTypeToString(phi::DataType::INT32),
+            DataTypeToString(phi::DataType::INT64)));
   if (place == cpu_place) {
     auto x_copy = x;
     if (index_type == phi::DataType::INT32) {
diff --git a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc
index b9bc98425c9bf5326c27fcbdeba8f00800e53ba2..f99c10d8ad8ba2b2d446bbe38c95fb7f2fd70983 100644
--- a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc
@@ -21,6 +21,8 @@
 #include "paddle/phi/kernels/funcs/adam_functors.h"
 #include "paddle/phi/kernels/funcs/selected_rows_functor.h"
 
+DECLARE_int32(inner_op_parallelism);
+
 namespace phi {
 namespace sr {
diff --git a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc
index 9a4fd216cd52851b8d62b09df10c853ccd159fa1..4bb0352528e4e4a596333b7e28d6e1bcf076392b 100644
--- a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h"
 
+#include
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/mixed_vector.h"
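Note: the sparse Adam kernel reads FLAGS_inner_op_parallelism, which is defined (DEFINE_int32) elsewhere in the codebase; DECLARE_int32 only pulls in the extern declaration, roughly (gflags expansion, simplified and not verbatim):

    // DECLARE_int32(inner_op_parallelism); expands approximately to:
    namespace fLI { extern std::int32_t FLAGS_inner_op_parallelism; }
    using fLI::FLAGS_inner_op_parallelism;

so moving the kernel into phi required restating the declaration locally.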
diff --git a/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h
index 84772de939b9d7ffb7e512e2bad9404737dc8c3d..c27a4269b83db4710b1e8161d2043bad6974935e 100644
--- a/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h
+++ b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
 #pragma once
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/selected_rows.h"
 #include "paddle/phi/kernels/funcs/lamb_functors.h"
 
@@ -309,8 +310,8 @@ void ComputeRowImpl(const Context& dev_ctx,
     auto pn = phi::funcs::ToVector(p_norm_ptr, 1, dev_ctx.GetPlace());
     auto tn =
         phi::funcs::ToVector(trust_ratio_div_norm_ptr, 1, dev_ctx.GetPlace());
-    auto dtype = paddle::framework::DataTypeToString(
-        paddle::framework::DataTypeTrait<T>::DataType());
+    auto dtype =
+        DataTypeToString(paddle::experimental::CppTypeToDataType<T>::Type());
     VLOG(1) << "Param " << dtype << " " << name << " pn = " << pn[0]
             << " , tn = " << tn[0];
   }
diff --git a/paddle/phi/kernels/xpu/index_sample_kernel.cc b/paddle/phi/kernels/xpu/index_sample_kernel.cc
index c0cf4587959fac5f368eeda95c2264f9f7bf5032..b2de915dc42fe1044452f6feadb393a3357e544f 100644
--- a/paddle/phi/kernels/xpu/index_sample_kernel.cc
+++ b/paddle/phi/kernels/xpu/index_sample_kernel.cc
@@ -32,9 +32,9 @@ void IndexSampleKernel(const Context& ctx,
       errors::InvalidArgument(
           "Input(Index) holds the wrong type, it holds %s, but "
           "desires to be %s or %s",
-          phi::DataTypeToString(index_type),
-          phi::DataTypeToString(DataType::INT32),
-          phi::DataTypeToString(DataType::INT64)));
+          DataTypeToString(index_type),
+          DataTypeToString(DataType::INT32),
+          DataTypeToString(DataType::INT64)));
 
   using XPUType = typename XPUTypeTrait<T>::Type;
diff --git a/paddle/phi/tests/kernels/test_math_function.cc b/paddle/phi/tests/kernels/test_math_function.cc
index 3c864c91bd6523731e3bfae41e0e5460b6c7911d..094dfccb5a385e6f07c74f858a5613a1f91604ae 100644
--- a/paddle/phi/tests/kernels/test_math_function.cc
+++ b/paddle/phi/tests/kernels/test_math_function.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include
+
 #include "gtest/gtest.h"
+#include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/tests/kernels/test_math_function.cu b/paddle/phi/tests/kernels/test_math_function.cu
index 74f3776990b65748297620802ac0dfbf359fcd30..d28e8b332ef3cebcf6b8dbf08f927ad444d6be0c 100644
--- a/paddle/phi/tests/kernels/test_math_function.cu
+++ b/paddle/phi/tests/kernels/test_math_function.cu
@@ -14,6 +14,7 @@
 
 #include "gtest/gtest.h"
 #include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
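Note: for downstream code that still calls the removed overload, the whole migration can be captured in one hypothetical shim (BlasFromExecutionContext is not part of this diff; it just spells out the replacement):

    template <typename DeviceContext, typename T>
    phi::funcs::BlasT<DeviceContext, T> BlasFromExecutionContext(
        const paddle::framework::ExecutionContext& exe_ctx) {
      // what the deleted GetBlas(ExecutionContext) overload used to do
      auto& dev_ctx = exe_ctx.template device_context<DeviceContext>();
      return phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
    }

The dtype-reporting changes in this series follow the same direction: prefer phi::DataType and DataTypeToString over the paddle::framework / proto::VarType spellings.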