BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Unverified commit 65420271

Authored by 张春乔 on Dec 07, 2022; committed via GitHub on Dec 07, 2022.

[phi::DenseTensor] Replace Tensor with phi::DenseTensor (#48682)

Parent: 693de9f0
Showing 419 changed files with 2450 additions and 2880 deletions.
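The change is mechanical and repeated across the files listed below: each operator file drops its file-local alias `using Tensor = phi::DenseTensor;` and spells the type out as `phi::DenseTensor` at every use site. A minimal, self-contained C++ sketch of that pattern follows; the stand-in `DenseTensor` struct and the function name are illustrative only, since the real class lives in Paddle's phi headers.

// Minimal sketch of the refactor, assuming a stand-in DenseTensor type;
// the real class is Paddle's phi::DenseTensor and requires Paddle's headers.
#include <iostream>

namespace phi {
struct DenseTensor {  // stand-in only, not the real implementation
  int numel = 0;
};
}  // namespace phi

// Before this commit an operator file typically contained:
//   using Tensor = phi::DenseTensor;   // file-local alias
//   Tensor tmp;
// After this commit the alias is gone and the type is written out:
void AfterPattern() {
  phi::DenseTensor tmp;  // fully qualified, no per-file alias
  tmp.numel = 8;
  std::cout << "numel = " << tmp.numel << "\n";
}

int main() {
  AfterPattern();
  return 0;
}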
paddle/fluid/imperative/gradient_accumulator.cc
paddle/fluid/imperative/gradient_accumulator.cc
+5
-5
paddle/fluid/operators/abs_op_mlu.cc
paddle/fluid/operators/abs_op_mlu.cc
+1
-3
paddle/fluid/operators/abs_op_npu.cc
paddle/fluid/operators/abs_op_npu.cc
+0
-2
paddle/fluid/operators/activation_op_mlu.cc
paddle/fluid/operators/activation_op_mlu.cc
+1
-3
paddle/fluid/operators/activation_op_npu.cc
paddle/fluid/operators/activation_op_npu.cc
+108
-110
paddle/fluid/operators/affine_grid_op.cc
paddle/fluid/operators/affine_grid_op.cc
+0
-2
paddle/fluid/operators/amp/alloc_float_status_op_npu.cc
paddle/fluid/operators/amp/alloc_float_status_op_npu.cc
+0
-2
paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc
...le/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc
+3
-5
paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
...le/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
+4
-6
paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
...uid/operators/amp/check_finite_and_unscale_op_npu_test.cc
+1
-3
paddle/fluid/operators/amp/clear_float_status_op_npu.cc
paddle/fluid/operators/amp/clear_float_status_op_npu.cc
+1
-3
paddle/fluid/operators/amp/get_float_status_op_npu.cc
paddle/fluid/operators/amp/get_float_status_op_npu.cc
+1
-3
paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
+2
-4
paddle/fluid/operators/arg_max_op_npu.cc
paddle/fluid/operators/arg_max_op_npu.cc
+1
-2
paddle/fluid/operators/arg_min_op_npu.cc
paddle/fluid/operators/arg_min_op_npu.cc
+0
-1
paddle/fluid/operators/argsort_op_npu.cc
paddle/fluid/operators/argsort_op_npu.cc
+17
-18
paddle/fluid/operators/attention_lstm_op.cc
paddle/fluid/operators/attention_lstm_op.cc
+18
-15
paddle/fluid/operators/attention_lstm_op.h
paddle/fluid/operators/attention_lstm_op.h
+0
-2
paddle/fluid/operators/batch_norm_op.cc
paddle/fluid/operators/batch_norm_op.cc
+9
-9
paddle/fluid/operators/batch_norm_op.cu
paddle/fluid/operators/batch_norm_op.cu
+0
-1
paddle/fluid/operators/batch_norm_op.h
paddle/fluid/operators/batch_norm_op.h
+0
-1
paddle/fluid/operators/batch_norm_op_mlu.cc
paddle/fluid/operators/batch_norm_op_mlu.cc
+6
-6
paddle/fluid/operators/batch_norm_op_npu.cc
paddle/fluid/operators/batch_norm_op_npu.cc
+1
-1
paddle/fluid/operators/bce_loss_op_mlu.cc
paddle/fluid/operators/bce_loss_op_mlu.cc
+0
-2
paddle/fluid/operators/bce_loss_op_npu.cc
paddle/fluid/operators/bce_loss_op_npu.cc
+0
-2
paddle/fluid/operators/cast_op.cc
paddle/fluid/operators/cast_op.cc
+1
-1
paddle/fluid/operators/cast_op_mlu.cc
paddle/fluid/operators/cast_op_mlu.cc
+0
-2
paddle/fluid/operators/cast_op_npu.cc
paddle/fluid/operators/cast_op_npu.cc
+0
-2
paddle/fluid/operators/center_loss_op.h
paddle/fluid/operators/center_loss_op.h
+1
-2
paddle/fluid/operators/clip_by_norm_op.h
paddle/fluid/operators/clip_by_norm_op.h
+0
-1
paddle/fluid/operators/clip_by_norm_op_npu.cc
paddle/fluid/operators/clip_by_norm_op_npu.cc
+3
-5
paddle/fluid/operators/clip_op_mlu.cc
paddle/fluid/operators/clip_op_mlu.cc
+4
-4
paddle/fluid/operators/clip_op_npu.cc
paddle/fluid/operators/clip_op_npu.cc
+4
-6
paddle/fluid/operators/coalesce_tensor_op.cc
paddle/fluid/operators/coalesce_tensor_op.cc
+1
-1
paddle/fluid/operators/collective/c_allreduce_op.h
paddle/fluid/operators/collective/c_allreduce_op.h
+2
-3
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
...d/operators/collective/c_softmax_with_cross_entropy_op.cu
+9
-11
paddle/fluid/operators/concat_op.cc
paddle/fluid/operators/concat_op.cc
+0
-1
paddle/fluid/operators/concat_op_mlu.cc
paddle/fluid/operators/concat_op_mlu.cc
+2
-2
paddle/fluid/operators/controlflow/logical_op_mlu.cc
paddle/fluid/operators/controlflow/logical_op_mlu.cc
+0
-2
paddle/fluid/operators/controlflow/logical_op_npu.cc
paddle/fluid/operators/controlflow/logical_op_npu.cc
+0
-2
paddle/fluid/operators/conv_op.h
paddle/fluid/operators/conv_op.h
+0
-2
paddle/fluid/operators/conv_op_mlu.cc
paddle/fluid/operators/conv_op_mlu.cc
+16
-17
paddle/fluid/operators/conv_op_npu.cc
paddle/fluid/operators/conv_op_npu.cc
+15
-14
paddle/fluid/operators/conv_transpose_op_mlu.cc
paddle/fluid/operators/conv_transpose_op_mlu.cc
+8
-9
paddle/fluid/operators/conv_transpose_op_npu.cc
paddle/fluid/operators/conv_transpose_op_npu.cc
+4
-5
paddle/fluid/operators/copy_cross_scope_op.cc
paddle/fluid/operators/copy_cross_scope_op.cc
+0
-2
paddle/fluid/operators/correlation_op.cc
paddle/fluid/operators/correlation_op.cc
+0
-2
paddle/fluid/operators/cos_sim_op.h
paddle/fluid/operators/cos_sim_op.h
+2
-4
paddle/fluid/operators/crop_op_npu.cc
paddle/fluid/operators/crop_op_npu.cc
+2
-4
paddle/fluid/operators/cross_entropy_op.h
paddle/fluid/operators/cross_entropy_op.h
+2
-4
paddle/fluid/operators/ctc_align_op.h
paddle/fluid/operators/ctc_align_op.h
+0
-2
paddle/fluid/operators/cudnn_lstm_op.cu.cc
paddle/fluid/operators/cudnn_lstm_op.cu.cc
+24
-25
paddle/fluid/operators/cumsum_op_mlu.cc
paddle/fluid/operators/cumsum_op_mlu.cc
+1
-3
paddle/fluid/operators/cumsum_op_npu.cc
paddle/fluid/operators/cumsum_op_npu.cc
+3
-5
paddle/fluid/operators/cvm_op.cc
paddle/fluid/operators/cvm_op.cc
+0
-2
paddle/fluid/operators/cvm_op.cu
paddle/fluid/operators/cvm_op.cu
+0
-1
paddle/fluid/operators/cvm_op.h
paddle/fluid/operators/cvm_op.h
+0
-2
paddle/fluid/operators/data_norm_op.cc
paddle/fluid/operators/data_norm_op.cc
+6
-7
paddle/fluid/operators/data_norm_op.cu
paddle/fluid/operators/data_norm_op.cu
+1
-2
paddle/fluid/operators/deformable_conv_op_mlu.cc
paddle/fluid/operators/deformable_conv_op_mlu.cc
+14
-16
paddle/fluid/operators/deformable_psroi_pooling_op.cu
paddle/fluid/operators/deformable_psroi_pooling_op.cu
+0
-1
paddle/fluid/operators/deformable_psroi_pooling_op.h
paddle/fluid/operators/deformable_psroi_pooling_op.h
+1
-3
paddle/fluid/operators/detection/bbox_util.cu.h
paddle/fluid/operators/detection/bbox_util.cu.h
+7
-9
paddle/fluid/operators/detection/bipartite_match_op.cc
paddle/fluid/operators/detection/bipartite_match_op.cc
+1
-3
paddle/fluid/operators/detection/box_clip_op.cu
paddle/fluid/operators/detection/box_clip_op.cu
+0
-1
paddle/fluid/operators/detection/box_clip_op.h
paddle/fluid/operators/detection/box_clip_op.h
+4
-5
paddle/fluid/operators/detection/box_coder_op_npu.cc
paddle/fluid/operators/detection/box_coder_op_npu.cc
+69
-66
paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
+0
-1
paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+12
-14
paddle/fluid/operators/detection/density_prior_box_op_npu.cc
paddle/fluid/operators/detection/density_prior_box_op_npu.cc
+36
-35
paddle/fluid/operators/detection/generate_mask_labels_op.cc
paddle/fluid/operators/detection/generate_mask_labels_op.cc
+29
-29
paddle/fluid/operators/detection/generate_proposal_labels_op.cc
.../fluid/operators/detection/generate_proposal_labels_op.cc
+31
-31
paddle/fluid/operators/detection/generate_proposals_op.cc
paddle/fluid/operators/detection/generate_proposals_op.cc
+20
-22
paddle/fluid/operators/detection/generate_proposals_op.cu
paddle/fluid/operators/detection/generate_proposals_op.cu
+19
-21
paddle/fluid/operators/detection/generate_proposals_v2_op.cc
paddle/fluid/operators/detection/generate_proposals_v2_op.cc
+0
-2
paddle/fluid/operators/detection/iou_similarity_op_mlu.cc
paddle/fluid/operators/detection/iou_similarity_op_mlu.cc
+24
-26
paddle/fluid/operators/detection/iou_similarity_op_npu.cc
paddle/fluid/operators/detection/iou_similarity_op_npu.cc
+24
-26
paddle/fluid/operators/detection/locality_aware_nms_op.cc
paddle/fluid/operators/detection/locality_aware_nms_op.cc
+4
-6
paddle/fluid/operators/detection/matrix_nms_op.cc
paddle/fluid/operators/detection/matrix_nms_op.cc
+0
-2
paddle/fluid/operators/detection/multiclass_nms_op.cc
paddle/fluid/operators/detection/multiclass_nms_op.cc
+4
-6
paddle/fluid/operators/detection/polygon_box_transform_op.cc
paddle/fluid/operators/detection/polygon_box_transform_op.cc
+0
-2
paddle/fluid/operators/detection/polygon_box_transform_op.cu
paddle/fluid/operators/detection/polygon_box_transform_op.cu
+0
-1
paddle/fluid/operators/detection/prior_box_op_npu.cc
paddle/fluid/operators/detection/prior_box_op_npu.cc
+3
-5
paddle/fluid/operators/detection/retinanet_detection_output_op.cc
...luid/operators/detection/retinanet_detection_output_op.cc
+24
-23
paddle/fluid/operators/detection/roi_perspective_transform_op.cc
...fluid/operators/detection/roi_perspective_transform_op.cc
+6
-8
paddle/fluid/operators/detection/rpn_target_assign_op.cc
paddle/fluid/operators/detection/rpn_target_assign_op.cc
+71
-67
paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
+10
-11
paddle/fluid/operators/detection/sigmoid_focal_loss_op.h
paddle/fluid/operators/detection/sigmoid_focal_loss_op.h
+10
-11
paddle/fluid/operators/detection/yolo_box_op_mlu.cc
paddle/fluid/operators/detection/yolo_box_op_mlu.cc
+1
-1
paddle/fluid/operators/detection_map_op.cc
paddle/fluid/operators/detection_map_op.cc
+0
-2
paddle/fluid/operators/dgc_clip_by_norm_op.h
paddle/fluid/operators/dgc_clip_by_norm_op.h
+0
-2
paddle/fluid/operators/dropout_op_mlu.cc
paddle/fluid/operators/dropout_op_mlu.cc
+3
-5
paddle/fluid/operators/dropout_op_npu.cc
paddle/fluid/operators/dropout_op_npu.cc
+7
-9
paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc
paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc
+0
-1
paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
+3
-4
paddle/fluid/operators/elementwise/elementwise_div_op.h
paddle/fluid/operators/elementwise/elementwise_div_op.h
+0
-1
paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc
paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc
+3
-5
paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc
paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc
+10
-12
paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc
...luid/operators/elementwise/elementwise_floordiv_op_npu.cc
+0
-2
paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc
paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc
+8
-10
paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc
paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc
+0
-2
paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc
paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc
+7
-9
paddle/fluid/operators/elementwise/elementwise_mlu.h
paddle/fluid/operators/elementwise/elementwise_mlu.h
+3
-3
paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc
paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc
+1
-3
paddle/fluid/operators/elementwise/elementwise_mul_op.h
paddle/fluid/operators/elementwise/elementwise_mul_op.h
+0
-1
paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc
paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc
+2
-3
paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc
paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc
+4
-5
paddle/fluid/operators/elementwise/elementwise_npu.h
paddle/fluid/operators/elementwise/elementwise_npu.h
+4
-5
paddle/fluid/operators/elementwise/elementwise_op.h
paddle/fluid/operators/elementwise/elementwise_op.h
+0
-6
paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc
paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc
+5
-7
paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc
paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc
+15
-17
paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc
paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc
+0
-2
paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
+3
-5
paddle/fluid/operators/expand_as_op.h
paddle/fluid/operators/expand_as_op.h
+0
-1
paddle/fluid/operators/expand_as_v2_op.h
paddle/fluid/operators/expand_as_v2_op.h
+0
-1
paddle/fluid/operators/expand_as_v2_op_mlu.cc
paddle/fluid/operators/expand_as_v2_op_mlu.cc
+0
-2
paddle/fluid/operators/expand_op.h
paddle/fluid/operators/expand_op.h
+0
-1
paddle/fluid/operators/expand_v2_op_npu.cc
paddle/fluid/operators/expand_v2_op_npu.cc
+4
-5
paddle/fluid/operators/eye_op_npu.cc
paddle/fluid/operators/eye_op_npu.cc
+0
-2
paddle/fluid/operators/fc_op.h
paddle/fluid/operators/fc_op.h
+0
-1
paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc
...e/fluid/operators/fill_constant_batch_size_like_op_npu.cc
+1
-3
paddle/fluid/operators/fill_constant_op_mlu.cc
paddle/fluid/operators/fill_constant_op_mlu.cc
+2
-1
paddle/fluid/operators/filter_by_instag_op.cu
paddle/fluid/operators/filter_by_instag_op.cu
+0
-1
paddle/fluid/operators/filter_by_instag_op.h
paddle/fluid/operators/filter_by_instag_op.h
+0
-1
paddle/fluid/operators/flatten_op.cc
paddle/fluid/operators/flatten_op.cc
+0
-2
paddle/fluid/operators/flatten_op_npu.cc
paddle/fluid/operators/flatten_op_npu.cc
+0
-2
paddle/fluid/operators/fsp_op.h
paddle/fluid/operators/fsp_op.h
+0
-2
paddle/fluid/operators/fused/attn_gemm.h
paddle/fluid/operators/fused/attn_gemm.h
+0
-1
paddle/fluid/operators/fused/attn_gemm_int8.h
paddle/fluid/operators/fused/attn_gemm_int8.h
+0
-1
paddle/fluid/operators/fused/conv_fusion_op.cu
paddle/fluid/operators/fused/conv_fusion_op.cu
+5
-5
paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
+86
-85
paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h
paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h
+10
-11
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
+10
-11
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
+6
-7
paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h
paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h
+19
-20
paddle/fluid/operators/fused/fmha_ref.h
paddle/fluid/operators/fused/fmha_ref.h
+0
-2
paddle/fluid/operators/fused/fused_attention_op.cc
paddle/fluid/operators/fused/fused_attention_op.cc
+0
-2
paddle/fluid/operators/fused/fused_attention_op.cu
paddle/fluid/operators/fused/fused_attention_op.cu
+3
-5
paddle/fluid/operators/fused/fused_attention_op_xpu.cc
paddle/fluid/operators/fused/fused_attention_op_xpu.cc
+83
-74
paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc
...rators/fused/fused_bias_dropout_residual_layer_norm_op.cc
+0
-2
paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu
...rators/fused/fused_bias_dropout_residual_layer_norm_op.cu
+0
-2
paddle/fluid/operators/fused/fused_bn_activation_op.cc
paddle/fluid/operators/fused/fused_bn_activation_op.cc
+3
-3
paddle/fluid/operators/fused/fused_bn_activation_op.cu
paddle/fluid/operators/fused/fused_bn_activation_op.cu
+2
-3
paddle/fluid/operators/fused/fused_bn_activation_op.h
paddle/fluid/operators/fused/fused_bn_activation_op.h
+0
-1
paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
+3
-3
paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
+2
-3
paddle/fluid/operators/fused/fused_bn_add_activation_op.h
paddle/fluid/operators/fused/fused_bn_add_activation_op.h
+0
-1
paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
...d/operators/fused/fused_embedding_eltwise_layernorm_op.cu
+0
-1
paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+12
-9
paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h
paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h
+0
-2
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+2
-3
paddle/fluid/operators/fused/fused_feedforward_op.cc
paddle/fluid/operators/fused/fused_feedforward_op.cc
+0
-1
paddle/fluid/operators/fused/fused_feedforward_op.cu
paddle/fluid/operators/fused/fused_feedforward_op.cu
+0
-2
paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc
paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc
+99
-95
paddle/fluid/operators/fused/fused_gate_attention.h
paddle/fluid/operators/fused/fused_gate_attention.h
+17
-19
paddle/fluid/operators/fused/fused_gate_attention_op.cc
paddle/fluid/operators/fused/fused_gate_attention_op.cc
+0
-1
paddle/fluid/operators/fused/fused_gate_attention_op.cu
paddle/fluid/operators/fused/fused_gate_attention_op.cu
+46
-47
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
+0
-1
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
+0
-2
paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc
paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc
+0
-2
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc
.../fluid/operators/fused/fused_multi_transformer_int8_op.cc
+1
-3
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
.../fluid/operators/fused/fused_multi_transformer_int8_op.cu
+20
-18
paddle/fluid/operators/fused/fused_multi_transformer_op.cc
paddle/fluid/operators/fused/fused_multi_transformer_op.cc
+1
-3
paddle/fluid/operators/fused/fused_multi_transformer_op.cu
paddle/fluid/operators/fused/fused_multi_transformer_op.cu
+49
-43
paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
+5
-7
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+0
-1
paddle/fluid/operators/fused/fusion_gru_op.cc
paddle/fluid/operators/fused/fusion_gru_op.cc
+16
-13
paddle/fluid/operators/fused/fusion_gru_op.h
paddle/fluid/operators/fused/fusion_gru_op.h
+0
-2
paddle/fluid/operators/fused/fusion_lstm_op.cc
paddle/fluid/operators/fused/fusion_lstm_op.cc
+12
-9
paddle/fluid/operators/fused/fusion_lstm_op.h
paddle/fluid/operators/fused/fusion_lstm_op.h
+0
-2
paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+5
-3
paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h
paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h
+0
-2
paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
...le/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
+11
-9
paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h
paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h
+0
-2
paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
...le/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+4
-3
paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h
paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h
+0
-2
paddle/fluid/operators/fused/fusion_seqpool_concat_op.h
paddle/fluid/operators/fused/fusion_seqpool_concat_op.h
+0
-2
paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
+2
-1
paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h
paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h
+0
-2
paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+6
-6
paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h
paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h
+0
-2
paddle/fluid/operators/fused/multihead_matmul_op.cu
paddle/fluid/operators/fused/multihead_matmul_op.cu
+5
-6
paddle/fluid/operators/fused/resnet_basic_block_op.cc
paddle/fluid/operators/fused/resnet_basic_block_op.cc
+0
-1
paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc
paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc
+0
-2
paddle/fluid/operators/fused/resnet_unit_op.cc
paddle/fluid/operators/fused/resnet_unit_op.cc
+0
-2
paddle/fluid/operators/fused/resnet_unit_op.cu
paddle/fluid/operators/fused/resnet_unit_op.cu
+71
-62
paddle/fluid/operators/fused/resnet_unit_op_xpu.cc
paddle/fluid/operators/fused/resnet_unit_op_xpu.cc
+55
-46
paddle/fluid/operators/fused/skip_layernorm_op.cu
paddle/fluid/operators/fused/skip_layernorm_op.cu
+0
-1
paddle/fluid/operators/fused/xpu_fused_common_function.h
paddle/fluid/operators/fused/xpu_fused_common_function.h
+8
-7
paddle/fluid/operators/fused/yolo_box_head_op.cu
paddle/fluid/operators/fused/yolo_box_head_op.cu
+0
-1
paddle/fluid/operators/fused/yolo_box_post_op.cu
paddle/fluid/operators/fused/yolo_box_post_op.cu
+0
-1
paddle/fluid/operators/gather_nd_op_mlu.cc
paddle/fluid/operators/gather_nd_op_mlu.cc
+0
-2
paddle/fluid/operators/gather_nd_op_npu.cc
paddle/fluid/operators/gather_nd_op_npu.cc
+0
-1
paddle/fluid/operators/gather_scatter_kernel.cc
paddle/fluid/operators/gather_scatter_kernel.cc
+11
-13
paddle/fluid/operators/gather_scatter_kernel.cu
paddle/fluid/operators/gather_scatter_kernel.cu
+12
-14
paddle/fluid/operators/gather_scatter_kernel.h
paddle/fluid/operators/gather_scatter_kernel.h
+24
-26
paddle/fluid/operators/gaussian_random_op.cc
paddle/fluid/operators/gaussian_random_op.cc
+0
-2
paddle/fluid/operators/gaussian_random_op_mlu.cc
paddle/fluid/operators/gaussian_random_op_mlu.cc
+1
-2
paddle/fluid/operators/gaussian_random_op_npu.cc
paddle/fluid/operators/gaussian_random_op_npu.cc
+1
-2
paddle/fluid/operators/gelu_op_npu.cc
paddle/fluid/operators/gelu_op_npu.cc
+0
-2
paddle/fluid/operators/graph_khop_sampler_op.cu
paddle/fluid/operators/graph_khop_sampler_op.cu
+0
-2
paddle/fluid/operators/graph_khop_sampler_op.h
paddle/fluid/operators/graph_khop_sampler_op.h
+0
-2
paddle/fluid/operators/grid_sampler_op_mlu.cc
paddle/fluid/operators/grid_sampler_op_mlu.cc
+2
-4
paddle/fluid/operators/group_norm_op.cc
paddle/fluid/operators/group_norm_op.cc
+7
-8
paddle/fluid/operators/group_norm_op.cu
paddle/fluid/operators/group_norm_op.cu
+5
-5
paddle/fluid/operators/group_norm_op.h
paddle/fluid/operators/group_norm_op.h
+0
-1
paddle/fluid/operators/group_norm_op_npu.cc
paddle/fluid/operators/group_norm_op_npu.cc
+23
-24
paddle/fluid/operators/gru_op.cc
paddle/fluid/operators/gru_op.cc
+7
-7
paddle/fluid/operators/gru_op.cu.cc
paddle/fluid/operators/gru_op.cu.cc
+5
-4
paddle/fluid/operators/gru_op.h
paddle/fluid/operators/gru_op.h
+11
-10
paddle/fluid/operators/gru_unit_op.h
paddle/fluid/operators/gru_unit_op.h
+2
-4
paddle/fluid/operators/huber_loss_op_mlu.cc
paddle/fluid/operators/huber_loss_op_mlu.cc
+12
-14
paddle/fluid/operators/huber_loss_op_npu.cc
paddle/fluid/operators/huber_loss_op_npu.cc
+2
-4
paddle/fluid/operators/im2sequence_op.h
paddle/fluid/operators/im2sequence_op.h
+18
-19
paddle/fluid/operators/index_sample_op_npu.cc
paddle/fluid/operators/index_sample_op_npu.cc
+2
-3
paddle/fluid/operators/index_select_op.h
paddle/fluid/operators/index_select_op.h
+0
-1
paddle/fluid/operators/index_select_op_npu.cc
paddle/fluid/operators/index_select_op_npu.cc
+3
-5
paddle/fluid/operators/inplace_abn_op.cc
paddle/fluid/operators/inplace_abn_op.cc
+5
-5
paddle/fluid/operators/inplace_abn_op.cu
paddle/fluid/operators/inplace_abn_op.cu
+3
-3
paddle/fluid/operators/inplace_abn_op.h
paddle/fluid/operators/inplace_abn_op.h
+0
-1
paddle/fluid/operators/instance_norm_op.cc
paddle/fluid/operators/instance_norm_op.cc
+6
-6
paddle/fluid/operators/instance_norm_op.h
paddle/fluid/operators/instance_norm_op.h
+0
-1
paddle/fluid/operators/instance_norm_op_npu.cc
paddle/fluid/operators/instance_norm_op_npu.cc
+1
-2
paddle/fluid/operators/interpolate_op.cu
paddle/fluid/operators/interpolate_op.cu
+8
-8
paddle/fluid/operators/interpolate_op.h
paddle/fluid/operators/interpolate_op.h
+1
-2
paddle/fluid/operators/interpolate_op_npu.cc
paddle/fluid/operators/interpolate_op_npu.cc
+2
-3
paddle/fluid/operators/interpolate_v2_op_mlu.cc
paddle/fluid/operators/interpolate_v2_op_mlu.cc
+2
-2
paddle/fluid/operators/interpolate_v2_op_npu.cc
paddle/fluid/operators/interpolate_v2_op_npu.cc
+38
-34
paddle/fluid/operators/jit/benchmark.cc
paddle/fluid/operators/jit/benchmark.cc
+17
-18
paddle/fluid/operators/kldiv_loss_op_npu.cc
paddle/fluid/operators/kldiv_loss_op_npu.cc
+1
-3
paddle/fluid/operators/label_smooth_op_mlu.cc
paddle/fluid/operators/label_smooth_op_mlu.cc
+0
-2
paddle/fluid/operators/label_smooth_op_npu.cc
paddle/fluid/operators/label_smooth_op_npu.cc
+4
-6
paddle/fluid/operators/layer_norm_kernel.cu.h
paddle/fluid/operators/layer_norm_kernel.cu.h
+0
-1
paddle/fluid/operators/layer_norm_op.cc
paddle/fluid/operators/layer_norm_op.cc
+3
-4
paddle/fluid/operators/layer_norm_op_mlu.cc
paddle/fluid/operators/layer_norm_op_mlu.cc
+8
-9
paddle/fluid/operators/layer_norm_op_npu.cc
paddle/fluid/operators/layer_norm_op_npu.cc
+16
-17
paddle/fluid/operators/layout_utils.h
paddle/fluid/operators/layout_utils.h
+0
-2
paddle/fluid/operators/limit_by_capacity_op.cu
paddle/fluid/operators/limit_by_capacity_op.cu
+0
-2
paddle/fluid/operators/log_loss_op_npu.cc
paddle/fluid/operators/log_loss_op_npu.cc
+0
-2
paddle/fluid/operators/log_loss_op_xpu.cc
paddle/fluid/operators/log_loss_op_xpu.cc
+0
-2
paddle/fluid/operators/lookup_table_dequant_op.h
paddle/fluid/operators/lookup_table_dequant_op.h
+0
-1
paddle/fluid/operators/lookup_table_op.h
paddle/fluid/operators/lookup_table_op.h
+0
-1
paddle/fluid/operators/lookup_table_v2_op.h
paddle/fluid/operators/lookup_table_v2_op.h
+5
-6
paddle/fluid/operators/lookup_table_v2_op_mlu.cc
paddle/fluid/operators/lookup_table_v2_op_mlu.cc
+1
-3
paddle/fluid/operators/lookup_table_v2_op_npu.cc
paddle/fluid/operators/lookup_table_v2_op_npu.cc
+5
-6
paddle/fluid/operators/lrn_op.h
paddle/fluid/operators/lrn_op.h
+0
-3
paddle/fluid/operators/lstm_op.h
paddle/fluid/operators/lstm_op.h
+19
-21
paddle/fluid/operators/lstmp_op.h
paddle/fluid/operators/lstmp_op.h
+23
-24
paddle/fluid/operators/masked_select_op_mlu.cc
paddle/fluid/operators/masked_select_op_mlu.cc
+7
-7
paddle/fluid/operators/match_matrix_tensor_op.cc
paddle/fluid/operators/match_matrix_tensor_op.cc
+1
-2
paddle/fluid/operators/match_matrix_tensor_op.h
paddle/fluid/operators/match_matrix_tensor_op.h
+0
-1
paddle/fluid/operators/math/context_project.h
paddle/fluid/operators/math/context_project.h
+23
-22
paddle/fluid/operators/math/eigen_values_vectors.h
paddle/fluid/operators/math/eigen_values_vectors.h
+13
-13
paddle/fluid/operators/math/sample_prob.cu
paddle/fluid/operators/math/sample_prob.cu
+1
-3
paddle/fluid/operators/math/sample_prob.h
paddle/fluid/operators/math/sample_prob.h
+0
-2
paddle/fluid/operators/math/sequence_pooling.cc
paddle/fluid/operators/math/sequence_pooling.cc
+2
-3
paddle/fluid/operators/math/softmax.cu
paddle/fluid/operators/math/softmax.cu
+2
-3
paddle/fluid/operators/math/tree2col.cu
paddle/fluid/operators/math/tree2col.cu
+4
-5
paddle/fluid/operators/matmul_op_mlu.cc
paddle/fluid/operators/matmul_op_mlu.cc
+4
-6
paddle/fluid/operators/matmul_op_npu.cc
paddle/fluid/operators/matmul_op_npu.cc
+13
-14
paddle/fluid/operators/matmul_v2_op_mlu.cc
paddle/fluid/operators/matmul_v2_op_mlu.cc
+4
-6
paddle/fluid/operators/matmul_v2_op_npu.cc
paddle/fluid/operators/matmul_v2_op_npu.cc
+10
-11
paddle/fluid/operators/mean_iou_op.h
paddle/fluid/operators/mean_iou_op.h
+3
-4
paddle/fluid/operators/mean_op_mlu.cc
paddle/fluid/operators/mean_op_mlu.cc
+8
-9
paddle/fluid/operators/mean_op_npu.cc
paddle/fluid/operators/mean_op_npu.cc
+10
-11
paddle/fluid/operators/meshgrid_op_mlu.cc
paddle/fluid/operators/meshgrid_op_mlu.cc
+6
-6
paddle/fluid/operators/metrics/accuracy_op_mlu.cc
paddle/fluid/operators/metrics/accuracy_op_mlu.cc
+7
-7
paddle/fluid/operators/metrics/accuracy_op_xpu.cc
paddle/fluid/operators/metrics/accuracy_op_xpu.cc
+0
-1
paddle/fluid/operators/metrics/precision_recall_op.h
paddle/fluid/operators/metrics/precision_recall_op.h
+0
-1
paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+0
-1
paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
+25
-24
paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+0
-1
paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
+0
-1
paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc
paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc
+1
-1
paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+5
-5
paddle/fluid/operators/mlu/mlu_baseop.cc
paddle/fluid/operators/mlu/mlu_baseop.cc
+64
-64
paddle/fluid/operators/mlu/mlu_baseop.h
paddle/fluid/operators/mlu/mlu_baseop.h
+6
-7
paddle/fluid/operators/modified_huber_loss_op.cu
paddle/fluid/operators/modified_huber_loss_op.cu
+0
-2
paddle/fluid/operators/modified_huber_loss_op.h
paddle/fluid/operators/modified_huber_loss_op.h
+0
-1
paddle/fluid/operators/multi_dot_op.cc
paddle/fluid/operators/multi_dot_op.cc
+0
-1
paddle/fluid/operators/multinomial_op_npu.cc
paddle/fluid/operators/multinomial_op_npu.cc
+0
-2
paddle/fluid/operators/multiplex_op.cc
paddle/fluid/operators/multiplex_op.cc
+0
-2
paddle/fluid/operators/nce_op.h
paddle/fluid/operators/nce_op.h
+5
-6
paddle/fluid/operators/norm_op_npu.cc
paddle/fluid/operators/norm_op_npu.cc
+0
-1
paddle/fluid/operators/norm_utils.cu.h
paddle/fluid/operators/norm_utils.cu.h
+14
-15
paddle/fluid/operators/number_count_op.cu
paddle/fluid/operators/number_count_op.cu
+0
-2
paddle/fluid/operators/one_hot_op.h
paddle/fluid/operators/one_hot_op.h
+0
-1
paddle/fluid/operators/one_hot_op_npu.cc
paddle/fluid/operators/one_hot_op_npu.cc
+1
-2
paddle/fluid/operators/one_hot_op_xpu.cc
paddle/fluid/operators/one_hot_op_xpu.cc
+0
-2
paddle/fluid/operators/one_hot_v2_op_mlu.cc
paddle/fluid/operators/one_hot_v2_op_mlu.cc
+7
-6
paddle/fluid/operators/one_hot_v2_op_npu.cc
paddle/fluid/operators/one_hot_v2_op_npu.cc
+1
-2
paddle/fluid/operators/optimizers/adadelta_op.cc
paddle/fluid/operators/optimizers/adadelta_op.cc
+0
-2
paddle/fluid/operators/optimizers/adagrad_op.cc
paddle/fluid/operators/optimizers/adagrad_op.cc
+0
-1
paddle/fluid/operators/optimizers/adam_op.h
paddle/fluid/operators/optimizers/adam_op.h
+0
-2
paddle/fluid/operators/optimizers/adam_op_mlu.cc
paddle/fluid/operators/optimizers/adam_op_mlu.cc
+6
-8
paddle/fluid/operators/optimizers/adam_op_npu.cc
paddle/fluid/operators/optimizers/adam_op_npu.cc
+6
-8
paddle/fluid/operators/optimizers/adamax_op.cc
paddle/fluid/operators/optimizers/adamax_op.cc
+0
-1
paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
+0
-1
paddle/fluid/operators/optimizers/dpsgd_op.cc
paddle/fluid/operators/optimizers/dpsgd_op.cc
+0
-1
paddle/fluid/operators/optimizers/ftrl_op.cc
paddle/fluid/operators/optimizers/ftrl_op.cc
+0
-1
paddle/fluid/operators/optimizers/ftrl_op.h
paddle/fluid/operators/optimizers/ftrl_op.h
+0
-1
paddle/fluid/operators/optimizers/merged_adam_op.cc
paddle/fluid/operators/optimizers/merged_adam_op.cc
+0
-2
paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc
paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc
+3
-2
paddle/fluid/operators/optimizers/momentum_op.cc
paddle/fluid/operators/optimizers/momentum_op.cc
+6
-8
paddle/fluid/operators/optimizers/momentum_op_mlu.cc
paddle/fluid/operators/optimizers/momentum_op_mlu.cc
+2
-2
paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
+0
-1
paddle/fluid/operators/optimizers/proximal_adagrad_op.h
paddle/fluid/operators/optimizers/proximal_adagrad_op.h
+0
-2
paddle/fluid/operators/optimizers/proximal_gd_op.cc
paddle/fluid/operators/optimizers/proximal_gd_op.cc
+0
-1
paddle/fluid/operators/optimizers/proximal_gd_op.h
paddle/fluid/operators/optimizers/proximal_gd_op.h
+0
-2
paddle/fluid/operators/optimizers/rmsprop_op_npu.cc
paddle/fluid/operators/optimizers/rmsprop_op_npu.cc
+6
-8
paddle/fluid/operators/optimizers/sparse_momentum_op.cc
paddle/fluid/operators/optimizers/sparse_momentum_op.cc
+9
-10
paddle/fluid/operators/p_norm_op_npu.cc
paddle/fluid/operators/p_norm_op_npu.cc
+10
-11
paddle/fluid/operators/pad3d_op_npu.cc
paddle/fluid/operators/pad3d_op_npu.cc
+0
-2
paddle/fluid/operators/pad_op_npu.cc
paddle/fluid/operators/pad_op_npu.cc
+0
-2
paddle/fluid/operators/partial_concat_op.cc
paddle/fluid/operators/partial_concat_op.cc
+0
-1
paddle/fluid/operators/partial_concat_op.cu
paddle/fluid/operators/partial_concat_op.cu
+1
-3
paddle/fluid/operators/partial_concat_op.h
paddle/fluid/operators/partial_concat_op.h
+0
-1
paddle/fluid/operators/partial_sum_op.cc
paddle/fluid/operators/partial_sum_op.cc
+0
-1
paddle/fluid/operators/partial_sum_op.cu
paddle/fluid/operators/partial_sum_op.cu
+2
-4
paddle/fluid/operators/partial_sum_op.h
paddle/fluid/operators/partial_sum_op.h
+0
-2
paddle/fluid/operators/pool_op.cc
paddle/fluid/operators/pool_op.cc
+4
-4
paddle/fluid/operators/pool_op.h
paddle/fluid/operators/pool_op.h
+0
-2
paddle/fluid/operators/pool_op_mlu.cc
paddle/fluid/operators/pool_op_mlu.cc
+6
-6
paddle/fluid/operators/positive_negative_pair_op.h
paddle/fluid/operators/positive_negative_pair_op.h
+0
-2
paddle/fluid/operators/prelu_op.cc
paddle/fluid/operators/prelu_op.cc
+0
-2
paddle/fluid/operators/prroi_pool_op.cc
paddle/fluid/operators/prroi_pool_op.cc
+0
-2
paddle/fluid/operators/prroi_pool_op.cu
paddle/fluid/operators/prroi_pool_op.cu
+0
-2
paddle/fluid/operators/pyramid_hash_op.cc
paddle/fluid/operators/pyramid_hash_op.cc
+0
-1
paddle/fluid/operators/random_routing_op.cu
paddle/fluid/operators/random_routing_op.cu
+0
-2
paddle/fluid/operators/rank_attention_op.cc
paddle/fluid/operators/rank_attention_op.cc
+0
-1
paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc
paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc
+0
-1
paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc
paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc
+0
-2
paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc
paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc
+11
-9
paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
+7
-8
paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc
paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc
+1
-1
paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
+3
-3
paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
+2
-3
paddle/fluid/operators/reduce_ops/reduce_op.h
paddle/fluid/operators/reduce_ops/reduce_op.h
+4
-5
paddle/fluid/operators/reduce_ops/reduce_op_function.h
paddle/fluid/operators/reduce_ops/reduce_op_function.h
+0
-1
paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
+0
-1
paddle/fluid/operators/reduce_ops/reduce_sum_op.h
paddle/fluid/operators/reduce_ops/reduce_sum_op.h
+1
-1
paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc
paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc
+1
-1
paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
+1
-1
paddle/fluid/operators/reshape_op.cc
paddle/fluid/operators/reshape_op.cc
+3
-5
paddle/fluid/operators/rnn_op_mlu.cc
paddle/fluid/operators/rnn_op_mlu.cc
+1
-2
paddle/fluid/operators/roi_align_op.cc
paddle/fluid/operators/roi_align_op.cc
+0
-2
paddle/fluid/operators/roi_align_op_mlu.cc
paddle/fluid/operators/roi_align_op_mlu.cc
+11
-13
paddle/fluid/operators/roi_align_op_npu.cc
paddle/fluid/operators/roi_align_op_npu.cc
+5
-6
paddle/fluid/operators/roi_pool_op.cc
paddle/fluid/operators/roi_pool_op.cc
+0
-2
paddle/fluid/operators/sample_logits_op.cu
paddle/fluid/operators/sample_logits_op.cu
+7
-8
paddle/fluid/operators/sample_logits_op.h
paddle/fluid/operators/sample_logits_op.h
+7
-9
paddle/fluid/operators/sampling_id_op.cc
paddle/fluid/operators/sampling_id_op.cc
+0
-2
paddle/fluid/operators/sampling_id_op.h
paddle/fluid/operators/sampling_id_op.h
+0
-2
paddle/fluid/operators/save_combine_op.cc
paddle/fluid/operators/save_combine_op.cc
+0
-2
paddle/fluid/operators/scatter_op_mlu.cc
paddle/fluid/operators/scatter_op_mlu.cc
+1
-1
paddle/fluid/operators/scatter_op_npu.cc
paddle/fluid/operators/scatter_op_npu.cc
+4
-6
paddle/fluid/operators/search_compute.h
paddle/fluid/operators/search_compute.h
+0
-1
paddle/fluid/operators/seed_op.cc
paddle/fluid/operators/seed_op.cc
+0
-1
paddle/fluid/operators/seed_op.h
paddle/fluid/operators/seed_op.h
+0
-1
paddle/fluid/operators/set_value_op.cc
paddle/fluid/operators/set_value_op.cc
+15
-12
paddle/fluid/operators/set_value_op.h
paddle/fluid/operators/set_value_op.h
+0
-1
paddle/fluid/operators/set_value_op_mlu.cc
paddle/fluid/operators/set_value_op_mlu.cc
+4
-4
paddle/fluid/operators/set_value_op_npu.cc
paddle/fluid/operators/set_value_op_npu.cc
+3
-3
paddle/fluid/operators/shape_op_mlu.cc
paddle/fluid/operators/shape_op_mlu.cc
+1
-2
paddle/fluid/operators/shape_op_npu.cc
paddle/fluid/operators/shape_op_npu.cc
+0
-2
paddle/fluid/operators/shard_index_op_npu.cc
paddle/fluid/operators/shard_index_op_npu.cc
+5
-6
paddle/fluid/operators/shuffle_batch_op.h
paddle/fluid/operators/shuffle_batch_op.h
+0
-1
paddle/fluid/operators/shuffle_channel_op.cu
paddle/fluid/operators/shuffle_channel_op.cu
+0
-1
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc
...uid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc
+0
-1
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc
...uid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc
+0
-1
paddle/fluid/operators/similarity_focus_op.h
paddle/fluid/operators/similarity_focus_op.h
+0
-1
paddle/fluid/operators/slice_op.cc
paddle/fluid/operators/slice_op.cc
+2
-4
paddle/fluid/operators/slice_op_mlu.cc
paddle/fluid/operators/slice_op_mlu.cc
+0
-2
paddle/fluid/operators/slice_op_npu.cc
paddle/fluid/operators/slice_op_npu.cc
+1
-2
paddle/fluid/operators/smooth_l1_loss_op.h
paddle/fluid/operators/smooth_l1_loss_op.h
+3
-4
paddle/fluid/operators/smooth_l1_loss_op_npu.cc
paddle/fluid/operators/smooth_l1_loss_op_npu.cc
+11
-11
paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc
paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc
+0
-2
paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc
paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc
+2
-4
paddle/fluid/operators/space_to_depth_op.cc
paddle/fluid/operators/space_to_depth_op.cc
+0
-2
paddle/fluid/operators/sparse_attention_op.cu
paddle/fluid/operators/sparse_attention_op.cu
+25
-23
paddle/fluid/operators/split_op_mlu.cc
paddle/fluid/operators/split_op_mlu.cc
+0
-2
paddle/fluid/operators/split_op_npu.cc
paddle/fluid/operators/split_op_npu.cc
+1
-3
paddle/fluid/operators/squared_l2_distance_op.h
paddle/fluid/operators/squared_l2_distance_op.h
+0
-2
paddle/fluid/operators/squared_l2_norm_op_mlu.cc
paddle/fluid/operators/squared_l2_norm_op_mlu.cc
+2
-4
paddle/fluid/operators/squared_l2_norm_op_npu.cc
paddle/fluid/operators/squared_l2_norm_op_npu.cc
+3
-5
paddle/fluid/operators/stack_op_mlu.cc
paddle/fluid/operators/stack_op_mlu.cc
+4
-6
paddle/fluid/operators/stack_op_npu.cc
paddle/fluid/operators/stack_op_npu.cc
+8
-10
paddle/fluid/operators/stft_op.h
paddle/fluid/operators/stft_op.h
+7
-9
paddle/fluid/operators/strided_slice_op.cc
paddle/fluid/operators/strided_slice_op.cc
+2
-4
paddle/fluid/operators/strided_slice_op_mlu.cc
paddle/fluid/operators/strided_slice_op_mlu.cc
+4
-5
paddle/fluid/operators/strided_slice_op_npu.cc
paddle/fluid/operators/strided_slice_op_npu.cc
+15
-16
paddle/fluid/operators/sum_op_mlu.cc
paddle/fluid/operators/sum_op_mlu.cc
+1
-2
paddle/fluid/operators/sum_op_npu.cc
paddle/fluid/operators/sum_op_npu.cc
+1
-2
paddle/fluid/operators/svd_helper.h
paddle/fluid/operators/svd_helper.h
+19
-19
paddle/fluid/operators/sync_batch_norm_op_mlu.cc
paddle/fluid/operators/sync_batch_norm_op_mlu.cc
+14
-15
paddle/fluid/operators/sync_batch_norm_op_npu.cc
paddle/fluid/operators/sync_batch_norm_op_npu.cc
+70
-72
paddle/fluid/operators/take_along_axis_op_npu.cc
paddle/fluid/operators/take_along_axis_op_npu.cc
+0
-2
paddle/fluid/operators/tdm_child_op.h
paddle/fluid/operators/tdm_child_op.h
+0
-1
paddle/fluid/operators/tdm_sampler_op.h
paddle/fluid/operators/tdm_sampler_op.h
+0
-1
paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
+6
-6
paddle/fluid/operators/teacher_student_sigmoid_loss_op.h
paddle/fluid/operators/teacher_student_sigmoid_loss_op.h
+0
-1
paddle/fluid/operators/temporal_shift_op.h
paddle/fluid/operators/temporal_shift_op.h
+0
-1
paddle/fluid/operators/tile_op_mlu.cc
paddle/fluid/operators/tile_op_mlu.cc
+0
-2
paddle/fluid/operators/tile_op_npu.cc
paddle/fluid/operators/tile_op_npu.cc
+0
-1
paddle/fluid/operators/top_k_op.cu
paddle/fluid/operators/top_k_op.cu
+1
-3
paddle/fluid/operators/top_k_op.h
paddle/fluid/operators/top_k_op.h
+0
-2
paddle/fluid/operators/top_k_op_npu.cc
paddle/fluid/operators/top_k_op_npu.cc
+1
-1
paddle/fluid/operators/top_k_op_xpu.cc
paddle/fluid/operators/top_k_op_xpu.cc
+0
-1
paddle/fluid/operators/tree_conv_op.h
paddle/fluid/operators/tree_conv_op.h
+6
-7
paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
+6
-8
paddle/fluid/operators/uniform_random_op.cc
paddle/fluid/operators/uniform_random_op.cc
+1
-1
paddle/fluid/operators/uniform_random_op.cu
paddle/fluid/operators/uniform_random_op.cu
+2
-1
paddle/fluid/operators/uniform_random_op.h
paddle/fluid/operators/uniform_random_op.h
+0
-1
paddle/fluid/operators/uniform_random_op_mlu.cc
paddle/fluid/operators/uniform_random_op_mlu.cc
+3
-2
paddle/fluid/operators/uniform_random_op_npu.cc
paddle/fluid/operators/uniform_random_op_npu.cc
+3
-2
paddle/fluid/operators/var_conv_2d_op.cc
paddle/fluid/operators/var_conv_2d_op.cc
+19
-18
paddle/fluid/operators/var_conv_2d_op.h
paddle/fluid/operators/var_conv_2d_op.h
+0
-1
paddle/fluid/operators/where_index_op_mlu.cc
paddle/fluid/operators/where_index_op_mlu.cc
+2
-4
paddle/fluid/operators/where_index_op_npu.cc
paddle/fluid/operators/where_index_op_npu.cc
+5
-7
paddle/fluid/imperative/gradient_accumulator.cc
@@ -644,11 +644,11 @@ void GradientAccumulator::CallGradientHooks() {
                         true,
                         platform::errors::PreconditionNotMet(
                             "Only can call gradient hooks after sum gradient completed."));
-  PADDLE_ENFORCE_EQ(HasInnerVar(),
+  PADDLE_ENFORCE_EQ(HasInnerVar(),
                     true,
                     platform::errors::PreconditionNotMet(
-                        "Leaf Tensor's inner var is nullptr when call gradient hook."));
+                        "Leaf Tensor's inner var is nullptr when "
+                        "call gradient hook."));
   PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(),
                     true,
paddle/fluid/operators/abs_op_mlu.cc
@@ -18,8 +18,6 @@ limitations under the Licnse. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <typename T>
 class AbsMLUKernel : public framework::OpKernel<T> {
  public:
@@ -54,7 +52,7 @@ class AbsGradMLUKernel : public framework::OpKernel<T> {
     MLUCnnlOpTensorDesc mul_op_desc(
         CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
-    Tensor sign_x;
+    phi::DenseTensor sign_x;
     sign_x.mutable_data<T>(x->dims(), ctx.GetPlace());
     MLUCnnl::Sign(ctx,
paddle/fluid/operators/abs_op_npu.cc
@@ -18,8 +18,6 @@ limitations under the Licnse. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class AbsNPUKernel : public framework::OpKernel<T> {
  public:
paddle/fluid/operators/activation_op_mlu.cc
@@ -21,8 +21,6 @@ limitations under the Licnse. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <cnnlActivationMode_t act_mode, typename T>
 class ActivationMLUKernel : public framework::OpKernel<T> {
  public:
@@ -442,7 +440,7 @@ class ReciprocalGradMLUKernel : public framework::OpKernel<T> {
     auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
     auto place = ctx.GetPlace();
     dx->mutable_data<T>(place);
-    Tensor square_out;
+    phi::DenseTensor square_out;
     square_out.Resize(out->dims());
     square_out.mutable_data<T>(place);
     MLUCnnlTensorDesc out_desc(*out);
paddle/fluid/operators/activation_op_npu.cc
This diff is collapsed.
paddle/fluid/operators/affine_grid_op.cc
@@ -28,8 +28,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 class AffineGridOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
paddle/fluid/operators/amp/alloc_float_status_op_npu.cc
@@ -21,8 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class AllocFloatStatusKernel : public framework::OpKernel<T> {
  public:
paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc
@@ -19,8 +19,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <typename T>
 class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel<T> {
   using MPDType = typename details::MPTypeTrait<T>::Type;
@@ -45,7 +43,7 @@ class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel<T> {
       out->mutable_data<T>(ctx.GetPlace());
 
       // check is_finite or is_nan
-      Tensor is_finite(found_inf->type());
+      phi::DenseTensor is_finite(found_inf->type());
       if (i != 0) {
         is_finite.Resize(phi::make_ddim({1}));
         is_finite.mutable_data<bool>(ctx.GetPlace());
@@ -78,8 +76,8 @@ class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel<T> {
       // out = in/scale, if found_inf = false
       // But when found_inf is true, the data of Out should not be used.
       // So, on MLU, we always compute out with in/scale.
-      Tensor float_x;
-      Tensor float_out;
+      phi::DenseTensor float_x;
+      phi::DenseTensor float_out;
       if (std::is_same<T, paddle::platform::float16>::value) {
         float_x.Resize(x->dims());
         float_out.Resize(out->dims());
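The comments in the hunk above state what check_finite_and_unscale computes: every input value is divided by the loss scale, and a found_inf flag records whether any non-finite value was seen, in which case the caller must ignore the unscaled output. The following is a small self-contained sketch of that semantics only, not Paddle's MLU/NPU kernel; the function and variable names are illustrative.

// Sketch of check_finite_and_unscale semantics: out = in / scale, and
// found_inf is set if any unscaled value is inf or nan.
#include <cmath>
#include <iostream>
#include <vector>

void CheckFiniteAndUnscale(const std::vector<float>& in, float scale,
                           std::vector<float>* out, bool* found_inf) {
  *found_inf = false;
  out->resize(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    float v = in[i] / scale;            // out = in / scale
    if (!std::isfinite(v)) *found_inf = true;
    (*out)[i] = v;                      // still written; callers must ignore
  }                                     // out when found_inf is true
}

int main() {
  std::vector<float> grads = {1.0f, 2.0f, INFINITY};
  std::vector<float> unscaled;
  bool found_inf = false;
  CheckFiniteAndUnscale(grads, 128.0f, &unscaled, &found_inf);
  std::cout << "found_inf = " << found_inf << "\n";  // prints 1
  return 0;
}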
paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
@@ -22,8 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 // NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA.
 // On NPU, we do not really check the data of input tensors,
 // but use NPUGetFloatStatus to check whether the nan/inf occurs on device,
@@ -47,13 +45,13 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
             .stream();
 
     // step1: inverse scale
-    Tensor const_tensor;
+    phi::DenseTensor const_tensor;
     const_tensor.mutable_data<T>({1}, ctx.GetPlace());
     FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0));
 
     // Inverse(1.0/scale)
     phi::DenseTensor* tmp_inverse_out = const_cast<phi::DenseTensor*>(scale);
-    Tensor inverse_out(scale->type());
+    phi::DenseTensor inverse_out(scale->type());
     inverse_out.Resize(scale->dims());
     inverse_out.mutable_data<T>(ctx.GetPlace());
     const auto& runner_inverse =
@@ -62,7 +60,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
     tmp_inverse_out = &inverse_out;
 
     // NOTE(zhiqiu):
-    Tensor tmp;
+    phi::DenseTensor tmp;
     tmp.mutable_data<float>({8}, ctx.GetPlace());
     // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place.
     // tmp is only placeholder.
@@ -73,7 +71,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
                     {{"message", std::string("check_nan_and_inf")}});
     runner_float_status.Run(stream);
 
-    Tensor sum;
+    phi::DenseTensor sum;
     sum.mutable_data<float>({1}, ctx.GetPlace());
     const auto& runner_reduce_sum =
         NpuOpRunner("ReduceSumD",
paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
@@ -31,8 +31,6 @@ limitations under the License. */
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
-using Tensor = phi::DenseTensor;
-
 USE_OP_ITSELF(check_finite_and_unscale);
 USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU);
@@ -110,7 +108,7 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) {
   ctx.Wait();
 
   // out found_inf
-  Tensor found_inf_tensor;
+  phi::DenseTensor found_inf_tensor;
   found_inf_tensor.Resize({1});
   bool *found_inf_data =
       found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
paddle/fluid/operators/amp/clear_float_status_op_npu.cc
@@ -21,8 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class ClearFloatStatusKernel : public framework::OpKernel<T> {
  public:
@@ -35,7 +33,7 @@ class ClearFloatStatusKernel : public framework::OpKernel<T> {
         platform::errors::PreconditionNotMet(
             "The input(FloatStatus) and Output(FloatStatusOut) "
             "should be the same."));
-    Tensor tmp;
+    phi::DenseTensor tmp;
     tmp.mutable_data<float>({8}, ctx.GetPlace());
     const auto& runner =
         NpuOpRunner("NPUClearFloatStatus", {tmp}, {*float_status_out});
paddle/fluid/operators/amp/get_float_status_op_npu.cc
@@ -21,8 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class GetFloatStatusKernel : public framework::OpKernel<T> {
  public:
@@ -35,7 +33,7 @@ class GetFloatStatusKernel : public framework::OpKernel<T> {
         platform::errors::PreconditionNotMet(
             "The input(FloatStatus) and Output(FloatStatusOut) "
             "should be the same."));
-    Tensor tmp;
+    phi::DenseTensor tmp;
     tmp.mutable_data<float>({8}, ctx.GetPlace());
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
@@ -25,8 +25,6 @@ DECLARE_int32(min_loss_scaling);
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <typename T>
 void Update(const platform::NPUDeviceContext& ctx,
             const std::vector<bool> found_inf_vec,
@@ -50,7 +48,7 @@ void Update(const platform::NPUDeviceContext& ctx,
                                  good_out_tensor->numel() * sizeof(int),
                                  stream);
     // bad_out_data = bad_in_data + 1
-    Tensor factor_tensor(bad_out_tensor->dtype());
+    phi::DenseTensor factor_tensor(bad_out_tensor->dtype());
     factor_tensor.mutable_data<int>({1}, place);
     FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
     const auto& runner_p2 = NpuOpRunner(
@@ -106,7 +104,7 @@ void Update(const platform::NPUDeviceContext& ctx,
                                  stream);
     // good_out_data = good_in_data + 1
-    Tensor factor_tensor(good_out_tensor->dtype());
+    phi::DenseTensor factor_tensor(good_out_tensor->dtype());
     factor_tensor.mutable_data<int>({1}, place);
     FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
     const auto& runner_p2 = NpuOpRunner(
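The "bad_out_data = bad_in_data + 1" and "good_out_data = good_in_data + 1" comments in the hunks above are the bookkeeping of dynamic loss scaling: a run of finite steps eventually grows the scale, a run of overflowed steps shrinks it. The sketch below shows that scheme under assumed parameter names (incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio); it is an illustration of the update rule, not Paddle's NPU kernel.

// Sketch of dynamic loss-scaling bookkeeping: count consecutive good/bad
// steps and adjust the scale when a counter reaches its threshold.
#include <iostream>

struct LossScaleState {
  float scale = 32768.0f;
  int good_steps = 0;  // consecutive steps without inf/nan
  int bad_steps = 0;   // consecutive steps that saw inf/nan
};

void UpdateLossScaling(bool found_inf, int incr_every_n_steps,
                       int decr_every_n_nan_or_inf, float incr_ratio,
                       float decr_ratio, LossScaleState* s) {
  if (found_inf) {
    s->good_steps = 0;
    s->bad_steps += 1;                          // bad_out = bad_in + 1
    if (s->bad_steps == decr_every_n_nan_or_inf) {
      s->scale *= decr_ratio;                   // shrink the scale
      s->bad_steps = 0;
    }
  } else {
    s->bad_steps = 0;
    s->good_steps += 1;                         // good_out = good_in + 1
    if (s->good_steps == incr_every_n_steps) {
      s->scale *= incr_ratio;                   // grow the scale
      s->good_steps = 0;
    }
  }
}

int main() {
  LossScaleState state;
  UpdateLossScaling(/*found_inf=*/true, 2000, 2, 2.0f, 0.5f, &state);
  std::cout << "scale = " << state.scale << "\n";
  return 0;
}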
paddle/fluid/operators/arg_max_op_npu.cc
@@ -18,7 +18,6 @@ limitations under the Licnse. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
 using NPUDeviceContext = platform::NPUDeviceContext;
 
 template <typename T>
@@ -36,7 +35,7 @@ struct VisitDataArgNPUMaxFunctor {
     auto dtype = ctx.Attr<int>("dtype");
     const bool& flatten = ctx.Attr<bool>("flatten");
 
-    Tensor transformed_x(x.type());
+    phi::DenseTensor transformed_x(x.type());
     transformed_x.ShareDataWith(x);
     if (flatten) {
       transformed_x.Resize(phi::make_ddim({x.numel()}));
paddle/fluid/operators/arg_min_op_npu.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
 
 template <typename DeviceContext, typename T>
 class ArgMinNPUKernel : public framework::OpKernel<T> {
paddle/fluid/operators/argsort_op_npu.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
 using NPUDeviceContext = platform::NPUDeviceContext;
 
 template <typename T>
@@ -79,16 +78,16 @@ class ArgsortNPUKernel : public framework::OpKernel<T> {
     framework::NPUAttributeMap attr = {{"axis", -1},
                                        {"descending", descending}};
 
-    Tensor indices_tmp(experimental::DataType::INT32);
+    phi::DenseTensor indices_tmp(experimental::DataType::INT32);
     indices_tmp.Resize(indices->dims());
 
     if (framework::TransToProtoVarType(input->dtype()) ==
         framework::proto::VarType::INT64) {
-      Tensor input_fp32(experimental::DataType::FLOAT32);
+      phi::DenseTensor input_fp32(experimental::DataType::FLOAT32);
       input_fp32.Resize(input->dims());
       CastToFP32(ctx, stream, *input, &input_fp32);
-      Tensor output_fp32(experimental::DataType::FLOAT32);
+      phi::DenseTensor output_fp32(experimental::DataType::FLOAT32);
       output_fp32.Resize(output->dims());
 
       if (axis == -1 || axis + 1 == in_dims.size()) {
@@ -112,12 +111,12 @@ class ArgsortNPUKernel : public framework::OpKernel<T> {
       }
       auto trans_dims = phi::make_ddim(shape);
 
-      Tensor trans_input(input_fp32.type());
+      phi::DenseTensor trans_input(input_fp32.type());
       trans_input.Resize(trans_dims);
       TranposeNPU<float>(ctx, stream, &perm, input_fp32, &trans_input);
-      Tensor trans_output(input_fp32.type());
-      Tensor trans_indices(experimental::DataType::INT32);
+      phi::DenseTensor trans_output(input_fp32.type());
+      phi::DenseTensor trans_indices(experimental::DataType::INT32);
       trans_output.mutable_data<float>(trans_dims, ctx.GetPlace());
       trans_indices.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
@@ -150,12 +149,12 @@ class ArgsortNPUKernel : public framework::OpKernel<T> {
       }
       auto trans_dims = phi::make_ddim(shape);
 
-      Tensor trans_input(input->type());
+      phi::DenseTensor trans_input(input->type());
       trans_input.Resize(trans_dims);
       TranposeNPU<T>(ctx, stream, &perm, *input, &trans_input);
-      Tensor trans_output(input->type());
-      Tensor trans_indices(experimental::DataType::INT32);
+      phi::DenseTensor trans_output(input->type());
+      phi::DenseTensor trans_indices(experimental::DataType::INT32);
       trans_output.mutable_data<T>(trans_dims, ctx.GetPlace());
       trans_indices.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
@@ -183,12 +182,12 @@ static void FullAssignNPU(const framework::ExecutionContext& ctx,
       phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
   const int64_t input_width = in_dims[in_dims.size() - 1];
 
-  Tensor input_tmp;
+  phi::DenseTensor input_tmp;
   input_tmp.ShareDataWith(input);
   input_tmp.Resize(
       phi::make_ddim(std::vector<int64_t>{input_height * input_width}));
 
-  Tensor indices_tmp;
+  phi::DenseTensor indices_tmp;
   indices_tmp.ShareDataWith(indices);
   indices_tmp.Resize(
       phi::make_ddim(std::vector<int64_t>{input_height, input_width}));
@@ -197,12 +196,12 @@ static void FullAssignNPU(const framework::ExecutionContext& ctx,
   for (Type i = 0; i < input_height; i++) {
     indexs_value.push_back(i * input_width);
   }
-  Tensor indexs_tmp(indices.type());
+  phi::DenseTensor indexs_tmp(indices.type());
   framework::TensorFromVector<int64_t>(
       indexs_value, ctx.device_context(), &indexs_tmp);
   indexs_tmp.Resize(phi::make_ddim(std::vector<int64_t>{input_height, 1}));
 
-  Tensor indices_index(indices.type());
+  phi::DenseTensor indices_index(indices.type());
   indices_index.mutable_data<int64_t>(indices_tmp.dims(), ctx.GetPlace());
   const auto& runner_add =
       NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {});
@@ -212,7 +211,7 @@ static void FullAssignNPU(const framework::ExecutionContext& ctx,
       phi::make_ddim(std::vector<int64_t>{input_height * input_width}));
   t_out->mutable_data<T>(ctx.GetPlace());
-  Tensor out_tmp(t_out->type());
+  phi::DenseTensor out_tmp(t_out->type());
   out_tmp.ShareDataWith(*t_out);
 
   const auto& runner =
       NpuOpRunner("TensorScatterUpdate",
@@ -252,15 +251,15 @@ class ArgsortGradNPUKernel : public framework::OpKernel<T> {
     }
     auto trans_dims = phi::make_ddim(shape);
 
-    Tensor trans_dout(dO->type());
-    Tensor trans_ids(indices->type());
+    phi::DenseTensor trans_dout(dO->type());
+    phi::DenseTensor trans_ids(indices->type());
     trans_dout.Resize(trans_dims);
     trans_ids.Resize(trans_dims);
     TranposeNPU<T>(ctx, stream, &perm, *dO, &trans_dout);
     TranposeNPU<int64_t>(ctx, stream, &perm, *indices, &trans_ids);
 
-    Tensor trans_dx(dO->type());
+    phi::DenseTensor trans_dx(dO->type());
     trans_dx.Resize(trans_dims);
     FullAssignNPU<T, int64_t>(
         ctx, stream, trans_dims, trans_dout, trans_ids, &trans_dx);
paddle/fluid/operators/attention_lstm_op.cc
@@ -212,39 +212,41 @@ void AttentionLSTMOpMaker::Make() {
           "this phi::DenseTensor is a matrix with shape (T X M), where T is the "
           "total time steps in this mini-batch, M is the dim size of x.");
  AddInput("C0",
-           "(Tensor) LSTM C0"
+           "(phi::DenseTensor) LSTM C0"
           "This is a tensor with shape (N x D), where N is the batch size, D "
           "is the gate size."
           "C0 is necessary because of attention.");
  AddInput("H0",
-           "(Tensor, optional) LSTM H0"
+           "(phi::DenseTensor, optional) LSTM H0"
           "This is a tensor with shape (N x D), where N is the "
           "batch size and D is the gate size.")
      .AsDispensable();
  AddInput("AttentionWeight",
-           "(Tensor) the weights of attention fc. Always relu the fc result."
+           "(phi::DenseTensor) the weights of attention fc. Always relu the fc "
+           "result."
           "The shape is ((M+D) x 1), where M is the dim size of x, D is the "
           "gate size of LSTM.");
  AddInput("AttentionBias",
-           "(Tensor, optional) the bias of attention fc."
+           "(phi::DenseTensor, optional) the bias of attention fc."
           "The shape is (1 x 1)")
      .AsDispensable();
  AddInput("AttentionScalar",
-           "(Tensor, optional) the scalar on the result of attentioned fc. "
+           "(phi::DenseTensor, optional) the scalar on the result of "
+           "attentioned fc. "
           "Always relu the Scalar."
           "The shape is (1 x 1)")
      .AsDispensable();
  AddInput("AttentionScalarBias",
-           "(Tensor, optional) the scalar bias of attention fc."
+           "(phi::DenseTensor, optional) the scalar bias of attention fc."
           "The shape is (1 x 1)")
      .AsDispensable();
  AddInput("LSTMWeight",
-           "(Tensor) the combined weight of LSTM"
+           "(phi::DenseTensor) the combined weight of LSTM"
           " - The shape is ((D+M) x 4D), where D is the hidden gate size, M "
           "is the dim size of x"
           " - Weight = {W_forget, W_input, W_output, W_cell}");
  AddInput("LSTMBias",
-           "(Tensor) the combined bias of LSTM, shape (1x4D)."
+           "(phi::DenseTensor) the combined bias of LSTM, shape (1x4D)."
           "Note: we should add the bias of hidden and context accorindg to "
           "the same gate: "
           "{B_forget, B_input, B_output, B_cell}");
@@ -257,20 +259,21 @@ void AttentionLSTMOpMaker::Make() {
            "(phi::DenseTensor) (same as LSTMOp) the cell state of LSTM operator. "
            "The shape is (T x D), and lod is the same with the `Input`.");
  AddOutput("AttentionedX",
-            "(Tensor) shape is (T x 1), the result after X * AttentionWeight,"
+            "(phi::DenseTensor) shape is (T x 1), the result after X * "
+            "AttentionWeight,"
            " where T is the total time steps in this mini-batch,"
            " D is the hidden size.")
      .AsIntermediate();
  AddOutput("AttentionFCOut",
-            "(Tensor) (max_seq_len, 1), compute at each step.")
+            "(phi::DenseTensor) (max_seq_len, 1), compute at each step.")
      .AsIntermediate();
  AddOutput("LSTMX",
-            "(Tensor) the input X of LSTM for each step."
+            "(phi::DenseTensor) the input X of LSTM for each step."
            "Shape is (1 x M), where M is the x frame size")
      .AsIntermediate();
-  AddOutput("LSTMOUT",
-            "(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step."
+  AddOutput("LSTMOUT",
+            "(phi::DenseTensor) the output of LSTM X(1*(D+M))* "
+            " weight((D+M)*4D) for each step."
            "Shape is (1 x 4D), where M is the x frame size")
      .AsIntermediate();
  AddAttr<std::string>("gate_activation",
paddle/fluid/operators/attention_lstm_op.h
@@ -18,8 +18,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
class AttentionLSTMOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
paddle/fluid/operators/batch_norm_op.cc
@@ -207,7 +207,7 @@ framework::OpKernelType BatchNormOp::GetExpectedKernelType(
framework::OpKernelType BatchNormOp::GetKernelTypeForVar(
    const std::string &var_name,
-    const Tensor &tensor,
+    const phi::DenseTensor &tensor,
    const framework::OpKernelType &expected_kernel_type) const {
#ifdef PADDLE_WITH_MKLDNN
  // Only input require reshaping, weights and
@@ -265,7 +265,7 @@ void BatchNormOpMaker::Make() {
           "The global variance (for training) "
           "or estimated Variance (for testing)");
  AddInput("MomentumTensor",
-           "(Tensor<float32>, optional) If provided, batch_norm will "
+           "(phi::DenseTensor<float32>, optional) If provided, batch_norm will "
           "use this as momentum, this has a higher priority than "
           "attr(momentum), the shape of this tensor MUST BE [1].")
      .AsDispensable();
@@ -380,9 +380,9 @@ framework::OpKernelType BatchNormGradOp::GetExpectedKernelType(
    PADDLE_THROW(platform::errors::InvalidArgument(
        "can't find gradient variable of Y"));
  }
-  const Tensor *t = nullptr;
-  if (var->IsType<Tensor>()) {
-    t = &var->Get<Tensor>();
+  const phi::DenseTensor *t = nullptr;
+  if (var->IsType<phi::DenseTensor>()) {
+    t = &var->Get<phi::DenseTensor>();
  } else if (var->IsType<phi::DenseTensor>()) {
    t = &var->Get<phi::DenseTensor>();
  }
@@ -397,7 +397,7 @@ framework::OpKernelType BatchNormGradOp::GetExpectedKernelType(
framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar(
    const std::string &var_name,
-    const Tensor &tensor,
+    const phi::DenseTensor &tensor,
    const framework::OpKernelType &expected_kernel_type) const {
#ifdef PADDLE_WITH_MKLDNN
  // Only input require reshaping, weights and
@@ -522,9 +522,9 @@ framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType(
    PADDLE_THROW(
        platform::errors::NotFound("cannot find gradient variable of Y"));
  }
-  const Tensor *t = nullptr;
-  if (var->IsType<Tensor>()) {
-    t = &var->Get<Tensor>();
+  const phi::DenseTensor *t = nullptr;
+  if (var->IsType<phi::DenseTensor>()) {
+    t = &var->Get<phi::DenseTensor>();
  } else if (var->IsType<phi::DenseTensor>()) {
    t = &var->Get<phi::DenseTensor>();
  }
paddle/fluid/operators/batch_norm_op.cu
@@ -34,7 +34,6 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent);
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
using DataLayout = phi::DataLayout;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;

paddle/fluid/operators/batch_norm_op.h
@@ -27,7 +27,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
using DataLayout = phi::DataLayout;
template <typename T>
paddle/fluid/operators/batch_norm_op_mlu.cc
@@ -78,8 +78,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
    saved_mean->mutable_data<MPDType>(place);
    saved_variance->mutable_data<MPDType>(place);
-    Tensor transformed_x;
-    Tensor transformed_y;
+    phi::DenseTensor transformed_x;
+    phi::DenseTensor transformed_y;
    const int transformed_dim_size = 4;
    const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C};
    MLUCnnlTensorDesc transformed_desc(transformed_dim_size,
@@ -116,7 +116,7 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
      if (ctx.HasInput("MomentumTensor")) {
        const auto *mom_tensor = ctx.Input<phi::DenseTensor>("MomentumTensor");
-        Tensor mom_cpu;
+        phi::DenseTensor mom_cpu;
        framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu);
        momentum = mom_cpu.data<float>()[0];
      }
@@ -226,9 +226,9 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
                       : x_dims[x_dims.size() - 1]);
    const int sample_size = x->numel() / N / C;
-    Tensor transformed_d_y;
-    Tensor transformed_x;
-    Tensor transformed_d_x;
+    phi::DenseTensor transformed_d_y;
+    phi::DenseTensor transformed_x;
+    phi::DenseTensor transformed_d_x;
    const int transformed_dim_size = 4;
    const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C};
paddle/fluid/operators/batch_norm_op_npu.cc
@@ -89,7 +89,7 @@ class NPUBatchNormOpKernel : public framework::OpKernel<T> {
      // is only used in this training branch
      if (ctx.HasInput("MomentumTensor")) {
        const auto *mom_tensor = ctx.Input<phi::DenseTensor>("MomentumTensor");
-        Tensor mom_cpu;
+        phi::DenseTensor mom_cpu;
        paddle::framework::TensorCopySync(
            *mom_tensor, platform::CPUPlace(), &mom_cpu);
        momentum = mom_cpu.data<float>()[0];
paddle/fluid/operators/bce_loss_op_mlu.cc
@@ -18,8 +18,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename T>
class BCELossMLUKernel : public framework::OpKernel<T> {
 public:

paddle/fluid/operators/bce_loss_op_npu.cc
@@ -18,8 +18,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename DeviceContext, typename T>
class BCELossNPUKernel : public framework::OpKernel<T> {
 public:
paddle/fluid/operators/cast_op.cc
@@ -40,7 +40,7 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
Cast Operator.
This Operator casts the input tensor to another data type and
-returns the Output Tensor. It's meaningless if the output dtype equals
+returns the Output phi::DenseTensor. It's meaningless if the output dtype equals
the input dtype, but it's fine if you do so.
)DOC");
paddle/fluid/operators/cast_op_mlu.cc
@@ -19,8 +19,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename T>
class CastMLUKernel : public framework::OpKernel<T> {
 public:

paddle/fluid/operators/cast_op_npu.cc
@@ -32,8 +32,6 @@ static std::map<framework::proto::VarType::Type, aclDataType>
        {framework::proto::VarType::FP64, ACL_DOUBLE},
};
-using Tensor = phi::DenseTensor;
-
template <typename DeviceContext, typename T>
class CastNPUKernel : public framework::OpKernel<T> {
 public:
paddle/fluid/operators/center_loss_op.h
@@ -26,7 +26,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
template <typename T,
          int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
@@ -81,7 +80,7 @@ class CenterLossKernel : public framework::OpKernel<T> {
    auto loss_data = out_loss->mutable_data<T>(ctx.GetPlace());
-    Tensor centers_diffacc;  // used to accumulate all diff
+    phi::DenseTensor centers_diffacc;  // used to accumulate all diff
    auto centers_diffacc_data =
        centers_diffacc.mutable_data<T>(centers_dim, ctx.GetPlace());
    int numel = centers_diffacc.numel();
paddle/fluid/operators/clip_by_norm_op.h
@@ -23,7 +23,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
// using SelectedRows = phi::SelectedRows;
template <typename T,
          int MajorType = Eigen::RowMajor,
paddle/fluid/operators/clip_by_norm_op_npu.cc
@@ -18,8 +18,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename DeviceContext, typename T>
class NPUClipByNormKernel : public framework::OpKernel<T> {
 public:
@@ -48,7 +46,7 @@ class NPUClipByNormKernel : public framework::OpKernel<T> {
            "Input(X) of ClipByNormOp should not be null. "
            "Please check if it is created correctly."));
-    Tensor square_sum(input->type());
+    phi::DenseTensor square_sum(input->type());
    square_sum.mutable_data<T>(framework::DDim({1}), place);
    const auto &x_dims = input->dims();
    std::vector<int> axis;
@@ -62,12 +60,12 @@ class NPUClipByNormKernel : public framework::OpKernel<T> {
        {{"axis", axis}, {"keep_dims", false}});
    square_sum_runner.Run(stream);
-    Tensor x_norm(input->type());
+    phi::DenseTensor x_norm(input->type());
    x_norm.mutable_data<T>(framework::DDim({1}), place);
    const auto &x_norm_runner = NpuOpRunner("Sqrt", {square_sum}, {x_norm}, {});
    x_norm_runner.Run(stream);
-    Tensor x_norm_t;
+    phi::DenseTensor x_norm_t;
    framework::TensorCopySync(x_norm, platform::CPUPlace(), &x_norm_t);
    auto x_norm_v = static_cast<float>(*x_norm_t.data<T>());
    if (x_norm_v <= max_norm) {
paddle/fluid/operators/clip_op_mlu.cc
@@ -29,7 +29,7 @@ class ClipMLUKernel : public framework::OpKernel<T> {
    auto max = static_cast<T>(ctx.Attr<float>("max"));
    if (ctx.HasInput("Min")) {
-      Tensor min_cpu;
+      phi::DenseTensor min_cpu;
      auto *min_tensor = ctx.Input<phi::DenseTensor>("Min");
      auto *min_data = min_tensor->data<T>();
      if (platform::is_mlu_place(min_tensor->place())) {
@@ -41,7 +41,7 @@ class ClipMLUKernel : public framework::OpKernel<T> {
    }
    if (ctx.HasInput("Max")) {
-      Tensor max_cpu;
+      phi::DenseTensor max_cpu;
      auto *max_tensor = ctx.Input<phi::DenseTensor>("Max");
      auto *max_data = max_tensor->data<T>();
      if (platform::is_mlu_place(max_tensor->place())) {
@@ -80,7 +80,7 @@ class ClipGradMLUKernel : public framework::OpKernel<T> {
    auto min_val = ctx.Attr<float>("min");
    if (min_tensor) {
-      Tensor min_data;
+      phi::DenseTensor min_data;
      framework::TensorCopy(
          *min_tensor,
          platform::CPUPlace(),
@@ -91,7 +91,7 @@ class ClipGradMLUKernel : public framework::OpKernel<T> {
    }
    auto max_val = ctx.Attr<float>("max");
    if (max_tensor) {
-      Tensor max_data;
+      phi::DenseTensor max_data;
      framework::TensorCopy(
          *max_tensor,
          platform::CPUPlace(),
paddle/fluid/operators/clip_op_npu.cc
@@ -18,8 +18,6 @@
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename DeviceContext, typename T>
class ClipNPUKernel : public framework::OpKernel<T> {
 public:
@@ -33,8 +31,8 @@ class ClipNPUKernel : public framework::OpKernel<T> {
    auto max_tensor =
        ctx.HasInput("Max") ? ctx.Input<phi::DenseTensor>("Max") : nullptr;
-    Tensor min_tensor_temp(x->type());
-    Tensor max_tensor_temp(x->type());
+    phi::DenseTensor min_tensor_temp(x->type());
+    phi::DenseTensor max_tensor_temp(x->type());
    if (min_tensor == nullptr) {
      auto min_value = static_cast<T>(ctx.Attr<float>("min"));
      min_tensor_temp.mutable_data<T>({1}, ctx.GetPlace());
@@ -74,7 +72,7 @@ class ClipGradNPUKernel : public framework::OpKernel<T> {
    auto min_val = ctx.Attr<float>("min");
    if (min_tensor) {
-      Tensor min_data;
+      phi::DenseTensor min_data;
      framework::TensorCopy(
          *min_tensor,
          platform::CPUPlace(),
@@ -86,7 +84,7 @@ class ClipGradNPUKernel : public framework::OpKernel<T> {
    auto max_val = ctx.Attr<float>("max");
    if (max_tensor) {
-      Tensor max_data;
+      phi::DenseTensor max_data;
      framework::TensorCopy(
          *max_tensor,
          platform::CPUPlace(),
paddle/fluid/operators/coalesce_tensor_op.cc
@@ -61,7 +61,7 @@ struct FillConstantVisitor {
          * = nullptr) const {
#ifdef PADDLE_WITH_ASCEND_CL
    if (platform::is_npu_place(dev_ctx_.GetPlace())) {
-      Tensor tensor_tmp(framework::TransToPhiDataType(dtype_));
+      phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(dtype_));
      tensor_tmp.mutable_data<T>({1}, context_.GetPlace());
      FillNpuTensorWithConstant<T>(&tensor_tmp, static_cast<T>(value_));
paddle/fluid/operators/collective/c_allreduce_op.h
@@ -151,10 +151,9 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
inline bool ContainsNan(const paddle::platform::NPUDeviceContext &dev_ctx,
                        aclrtStream stream,
                        const phi::DenseTensor *in) {
-  using Tensor = phi::DenseTensor;
-  Tensor out(in->type());
+  phi::DenseTensor out(in->type());
-  Tensor mean(in->type());
+  phi::DenseTensor mean(in->type());
  mean.Resize({1});
  mean.mutable_data<float>(dev_ctx.GetPlace());
  std::vector<int> axes;
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
@@ -24,8 +24,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
@@ -126,7 +124,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
    const int N = phi::funcs::SizeToAxis(axis, logits_dims);
    const int D = phi::funcs::SizeFromAxis(axis, logits_dims);
-    Tensor logits_2d, softmax_2d, loss_2d;
+    phi::DenseTensor logits_2d, softmax_2d, loss_2d;
    logits_2d.ShareDataWith(*logits).Resize({N, D});
    softmax_2d.ShareDataWith(*softmax).Resize({N, D});
    loss_2d.ShareDataWith(*loss).Resize({N, 1});
@@ -135,7 +133,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
    auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
    // step 1, obtain logit_max
-    Tensor logits_max;
+    phi::DenseTensor logits_max;
    logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
    void* logits_max_buff = logits_max.mutable_data<T>(place);
@@ -163,7 +161,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
            .unaryExpr(math::ValueClip<T>());
    // step 3, obtain predict target
-    Tensor predicted_logits;
+    phi::DenseTensor predicted_logits;
    predicted_logits =
        ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
    predicted_logits.mutable_data<T>(place);
@@ -215,7 +213,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
    eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp();
    // step 5, obtain sum_exp_logits
-    Tensor sum_exp_logits;
+    phi::DenseTensor sum_exp_logits;
    sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
    void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
@@ -278,7 +276,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
    const int N = phi::funcs::SizeToAxis(axis, logits_dims);
    const int D = phi::funcs::SizeFromAxis(axis, logits_dims);
-    Tensor logits_2d, softmax_2d, loss_2d;
+    phi::DenseTensor logits_2d, softmax_2d, loss_2d;
    logits_2d.ShareDataWith(*logits).Resize({N, D});
    softmax_2d.ShareDataWith(*softmax).Resize({N, D});
    loss_2d.ShareDataWith(*loss).Resize({N, 1});
@@ -287,7 +285,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
    auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
    // step 1, obtain logit_max
-    Tensor logits_max;
+    phi::DenseTensor logits_max;
    logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
    auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
@@ -309,7 +307,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
            .unaryExpr(math::ValueClip<T>());
    // step 3, obtain predict target
-    Tensor predicted_logits;
+    phi::DenseTensor predicted_logits;
    predicted_logits =
        ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
    predicted_logits.mutable_data<T>(place);
@@ -355,7 +353,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
    eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp();
    // step 5, obtain sum_exp_logits
-    Tensor sum_exp_logits;
+    phi::DenseTensor sum_exp_logits;
    sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
    void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
@@ -405,7 +403,7 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
    const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims);
    const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims);
-    Tensor logit_grad_2d;
+    phi::DenseTensor logit_grad_2d;
    logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D});
    int blocks = NumBlocks(N * D);
paddle/fluid/operators/concat_op.cc
@@ -26,7 +26,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
class ConcatOp : public framework::OperatorWithKernel {
 public:
paddle/fluid/operators/concat_op_mlu.cc
@@ -119,7 +119,7 @@ class ConcatGradMLUKernel : public framework::OpKernel<T> {
            out_grad->dims().size()));
    // get output tensor that the name is not kEmptyVarName
    std::vector<void*> outputs_vec;
-    std::vector<Tensor> tmp_outputs_vec;
+    std::vector<phi::DenseTensor> tmp_outputs_vec;
    std::vector<MLUCnnlTensorDesc> output_descs;
    std::vector<cnnlTensorDescriptor_t> descs_vec;
    for (size_t j = 0; j < outs.size(); ++j) {
@@ -129,7 +129,7 @@ class ConcatGradMLUKernel : public framework::OpKernel<T> {
        output_descs.emplace_back(MLUCnnlTensorDesc(*outs[j]));
        outputs_vec.push_back(GetBasePtr(outs[j]));
      } else {
-        Tensor tmp_tensor;
+        phi::DenseTensor tmp_tensor;
        tmp_tensor.mutable_data<T>(ins[j]->dims(), ctx.GetPlace());
        tmp_outputs_vec.push_back(tmp_tensor);
        output_descs.emplace_back(MLUCnnlTensorDesc(*ins[j]));
paddle/fluid/operators/controlflow/logical_op_mlu.cc
@@ -18,8 +18,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename T, cnnlLogicOp_t log_method>
class LogicalMLUKernel : public framework::OpKernel<T> {
 public:

paddle/fluid/operators/controlflow/logical_op_npu.cc
@@ -15,8 +15,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename DeviceContext, typename T>
class LogicalNotNPUKernel : public framework::OpKernel<T> {
 public:

paddle/fluid/operators/conv_op.h
@@ -29,8 +29,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
// Base convolution operator definations for other conv
// like operators to reuse the implementation.
inline int ConvOutputSize(
paddle/fluid/operators/conv_op_mlu.cc
@@ -18,7 +18,6 @@
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
using DataLayout = phi::DataLayout;
template <typename T>
@@ -56,8 +55,8 @@ class MLUConvOpKernel : public framework::OpKernel<T> {
    UpdatePaddingAndDilation(
        &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
-    Tensor input_tensor(input->type());
-    Tensor output_tensor(output->type());
+    phi::DenseTensor input_tensor(input->type());
+    phi::DenseTensor output_tensor(output->type());
    const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
    if (channel_last) {
      input_tensor.ShareDataWith(*input);
@@ -78,7 +77,7 @@ class MLUConvOpKernel : public framework::OpKernel<T> {
    output_tensor.set_layout(DataLayout::kNHWC);
    // transpose filter from MCHW to MHWC
-    Tensor trans_filter(filter->type());
+    phi::DenseTensor trans_filter(filter->type());
    TransposeFromMLUTensor<T>(ctx,
                              perm_to_nhwc,
                              filter,
@@ -166,8 +165,8 @@ class MLUConvGradOpKernel : public framework::OpKernel<T> {
    UpdatePaddingAndDilation(
        &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
-    Tensor input_tensor(input->type());
-    Tensor output_grad_tensor(output_grad->type());
+    phi::DenseTensor input_tensor(input->type());
+    phi::DenseTensor output_grad_tensor(output_grad->type());
    const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
    const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
    if (channel_last) {
@@ -193,7 +192,7 @@ class MLUConvGradOpKernel : public framework::OpKernel<T> {
      filter_grad->mutable_data<T>(ctx.GetPlace());
      auto filter_grad_dims = filter_grad->dims();
-      Tensor temp_filter_grad(filter_grad->type());
+      phi::DenseTensor temp_filter_grad(filter_grad->type());
      temp_filter_grad.mutable_data<T>({filter_grad_dims[0],
                                        filter_grad_dims[2],
                                        filter_grad_dims[3],
@@ -234,7 +233,7 @@ class MLUConvGradOpKernel : public framework::OpKernel<T> {
    if (input_grad) {
      input_grad->mutable_data<T>(ctx.GetPlace());
-      Tensor input_grad_tensor(input_grad->type());
+      phi::DenseTensor input_grad_tensor(input_grad->type());
      if (channel_last) {
        input_grad_tensor.ShareDataWith(*input_grad);
      } else {
@@ -248,7 +247,7 @@ class MLUConvGradOpKernel : public framework::OpKernel<T> {
      input_grad_tensor.set_layout(DataLayout::kNHWC);
      // transpose filter from MCHW to MHWC
-      Tensor trans_filter(filter->type());
+      phi::DenseTensor trans_filter(filter->type());
      TransposeFromMLUTensor<T>(ctx,
                                perm_to_nhwc,
                                filter,
@@ -326,8 +325,8 @@ class MLUDepthwiseConvOpKernel : public framework::OpKernel<T> {
    UpdatePaddingAndDilation(
        &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
-    Tensor input_tensor(input->type());
-    Tensor output_tensor(output->type());
+    phi::DenseTensor input_tensor(input->type());
+    phi::DenseTensor output_tensor(output->type());
    const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
    if (channel_last) {
      groups = in_dims[3];
@@ -350,7 +349,7 @@ class MLUDepthwiseConvOpKernel : public framework::OpKernel<T> {
    output_tensor.set_layout(DataLayout::kNHWC);
    // transpose filter from MCHW to MHWC
-    Tensor trans_filter(filter->type());
+    phi::DenseTensor trans_filter(filter->type());
    TransposeFromMLUTensor<T>(ctx,
                              perm_to_nhwc,
                              filter,
@@ -438,8 +437,8 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel<T> {
    UpdatePaddingAndDilation(
        &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
-    Tensor input_tensor(input->type());
-    Tensor output_grad_tensor(output_grad->type());
+    phi::DenseTensor input_tensor(input->type());
+    phi::DenseTensor output_grad_tensor(output_grad->type());
    const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
    const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
    const std::vector<int> perm_hwcm_to_mchw = {3, 2, 0, 1};
@@ -469,7 +468,7 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel<T> {
      filter_grad->mutable_data<T>(ctx.GetPlace());
      auto filter_grad_dims = filter_grad->dims();
-      Tensor temp_filter_grad(filter_grad->type());
+      phi::DenseTensor temp_filter_grad(filter_grad->type());
      // Details about setting diff_w hwcn for better performance, see the CNNL
      // documentation.
      temp_filter_grad.mutable_data<T>({filter_grad_dims[perm_mchw_to_hwcm[0]],
@@ -512,7 +511,7 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel<T> {
    if (input_grad) {
      input_grad->mutable_data<T>(ctx.GetPlace());
-      Tensor input_grad_tensor(input_grad->type());
+      phi::DenseTensor input_grad_tensor(input_grad->type());
      if (channel_last) {
        input_grad_tensor.ShareDataWith(*input_grad);
      } else {
@@ -526,7 +525,7 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel<T> {
      input_grad_tensor.set_layout(DataLayout::kNHWC);
      // transpose filter from MCHW to MHWC
-      Tensor trans_filter(filter->type());
+      phi::DenseTensor trans_filter(filter->type());
      TransposeFromMLUTensor<T>(ctx,
                                perm_to_nhwc,
                                filter,
paddle/fluid/operators/conv_op_npu.cc
@@ -18,7 +18,6 @@
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
using NPUDeviceContext = platform::NPUDeviceContext;
static void CastToFP16(const framework::ExecutionContext& ctx,
                       const aclrtStream& stream,
@@ -104,7 +103,7 @@ class DepthwiseConvNPUKernel : public framework::OpKernel<T> {
    std::vector<int> strides(4, 1);
    std::vector<int> dilations(4, 1);
-    Tensor input_tensor, output_tensor;
+    phi::DenseTensor input_tensor, output_tensor;
    input_tensor.ShareDataWith(*input);
    output_tensor.ShareDataWith(*output);
@@ -125,7 +124,7 @@ class DepthwiseConvNPUKernel : public framework::OpKernel<T> {
    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
    // Transform filter (n, 1, h, w) --> (1, n, h, w)
-    Tensor transformed_filter(filter->type());
+    phi::DenseTensor transformed_filter(filter->type());
    transformed_filter.mutable_data<T>({filter->dims()[1],
                                        filter->dims()[0],
                                        filter->dims()[2],
@@ -189,7 +188,7 @@ class DepthwiseConvGradNPUKernel : public framework::OpKernel<T> {
    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
    // Transform filter (n, 1, h, w) --> (1, n, h, w)
-    Tensor transformed_filter(filter->type());
+    phi::DenseTensor transformed_filter(filter->type());
    transformed_filter.mutable_data<T>({filter->dims()[1],
                                        filter->dims()[0],
                                        filter->dims()[2],
@@ -204,7 +203,7 @@ class DepthwiseConvGradNPUKernel : public framework::OpKernel<T> {
    std::vector<int> strides(4, 1);
    std::vector<int> dilations(4, 1);
-    Tensor input_tensor, output_grad_tensor;
+    phi::DenseTensor input_tensor, output_grad_tensor;
    input_tensor.ShareDataWith(*input);
    output_grad_tensor.ShareDataWith(*output_grad);
    if (channel_last) {
@@ -247,7 +246,7 @@ class DepthwiseConvGradNPUKernel : public framework::OpKernel<T> {
    }
    if (input_grad) {
      input_grad->mutable_data<T>(ctx.GetPlace());
-      Tensor input_grad_tensor;
+      phi::DenseTensor input_grad_tensor;
      input_grad_tensor.ShareDataWith(*input_grad);
      if (channel_last) {
        input_grad_tensor.set_layout(DataLayout::kNHWC);
@@ -305,7 +304,7 @@ class NPUConvOpKernel : public framework::OpKernel<T> {
    std::vector<int> strides_vec(4, 1);
    std::vector<int> dilations_vec(4, 1);
-    Tensor input_tensor, output_tensor;
+    phi::DenseTensor input_tensor, output_tensor;
    input_tensor.ShareDataWith(*input);
    output_tensor.ShareDataWith(*output);
    if (channel_last) {
@@ -378,7 +377,7 @@ class NPUConvGradOpKernel : public framework::OpKernel<T> {
    std::vector<int> strides_vec(4, 1);
    std::vector<int> dilations_vec(4, 1);
-    Tensor input_tensor, output_grad_tensor;
+    phi::DenseTensor input_tensor, output_grad_tensor;
    input_tensor.ShareDataWith(*input);
    output_grad_tensor.ShareDataWith(*output_grad);
    if (channel_last) {
@@ -400,7 +399,7 @@ class NPUConvGradOpKernel : public framework::OpKernel<T> {
      filter_grad->mutable_data<T>(ctx.GetPlace());
      std::vector<int> filter_shape_vec = phi::vectorize<int>(filter->dims());
-      Tensor filter_grad_fp32(experimental::DataType::FLOAT32);
+      phi::DenseTensor filter_grad_fp32(experimental::DataType::FLOAT32);
      filter_grad_fp32.Resize(filter_grad->dims());
      if (framework::TransToProtoVarType(input->dtype()) ==
@@ -430,7 +429,7 @@ class NPUConvGradOpKernel : public framework::OpKernel<T> {
      input_grad->mutable_data<T>(ctx.GetPlace());
      std::vector<int> input_shape_vec = phi::vectorize<int>(input->dims());
-      Tensor input_grad_tensor;
+      phi::DenseTensor input_grad_tensor;
      input_grad_tensor.ShareDataWith(*input_grad);
      if (channel_last) {
        input_grad_tensor.set_layout(DataLayout::kNHWC);
@@ -617,8 +616,9 @@ class NPUConv3dGradKernel : public framework::OpKernel<T> {
      filter_grad->mutable_data<T>(ctx.GetPlace());
      std::vector<int> filter_shape_vec = phi::vectorize<int>(filter->dims());
-      Tensor filter_grad_tensor = ctx.AllocateTmpTensor<T, NPUDeviceContext>(
-          filter_grad->dims(), dev_ctx);
+      phi::DenseTensor filter_grad_tensor =
+          ctx.AllocateTmpTensor<T, NPUDeviceContext>(filter_grad->dims(),
+                                                     dev_ctx);
      filter_grad_tensor.ShareDataWith(*filter_grad);
      filter_grad_tensor.set_layout(DataLayout::kNCDHW);
@@ -638,8 +638,9 @@ class NPUConv3dGradKernel : public framework::OpKernel<T> {
      input_grad->mutable_data<T>(ctx.GetPlace());
      std::vector<int> input_shape_vec = phi::vectorize<int>(input->dims());
-      Tensor input_grad_tensor = ctx.AllocateTmpTensor<T, NPUDeviceContext>(
-          input_grad->dims(), dev_ctx);
+      phi::DenseTensor input_grad_tensor =
+          ctx.AllocateTmpTensor<T, NPUDeviceContext>(input_grad->dims(),
+                                                     dev_ctx);
      input_grad_tensor.ShareDataWith(*input_grad);
      input_grad_tensor.set_layout(DataLayout::kNCDHW);
paddle/fluid/operators/conv_transpose_op_mlu.cc
@@ -20,7 +20,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
using DataLayout = phi::DataLayout;
template <typename T>
@@ -61,8 +60,8 @@ class Conv2DTransposeMLUKernel : public framework::OpKernel<T> {
    phi::UpdatePaddingAndDilation(
        &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
-    Tensor input_tensor(input->type());
-    Tensor output_tensor(output->type());
+    phi::DenseTensor input_tensor(input->type());
+    phi::DenseTensor output_tensor(output->type());
    input_tensor.set_layout(DataLayout::kNHWC);
    output_tensor.set_layout(DataLayout::kNHWC);
    const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
@@ -84,7 +83,7 @@ class Conv2DTransposeMLUKernel : public framework::OpKernel<T> {
    }
    // transpose filter from MCHW to MHWC
-    Tensor trans_filter(filter->type());
+    phi::DenseTensor trans_filter(filter->type());
    TransposeFromMLUTensor<T>(ctx,
                              perm_to_nhwc,
                              filter,
@@ -168,8 +167,8 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel<T> {
    phi::UpdatePaddingAndDilation(
        &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
-    Tensor input_tensor(input->type());
-    Tensor output_grad_tensor(output_grad->type());
+    phi::DenseTensor input_tensor(input->type());
+    phi::DenseTensor output_grad_tensor(output_grad->type());
    output_grad_tensor.set_layout(DataLayout::kNHWC);
    const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
@@ -191,7 +190,7 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel<T> {
    }
    // transpose filter from MCHW to MHWC
-    Tensor trans_filter(filter->type());
+    phi::DenseTensor trans_filter(filter->type());
    TransposeFromMLUTensor<T>(ctx,
                              perm_to_nhwc,
                              filter,
@@ -217,7 +216,7 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel<T> {
    if (filter_grad) {
      filter_grad->mutable_data<T>(ctx.GetPlace());
-      Tensor filter_grad_tensor(filter_grad->type());
+      phi::DenseTensor filter_grad_tensor(filter_grad->type());
      // filter_grad always MCHW
      // filter_grad_tensor always MHWC
      auto filter_grad_dims = filter_grad->dims();
@@ -253,7 +252,7 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel<T> {
    if (input_grad) {
      input_grad->mutable_data<T>(ctx.GetPlace());
-      Tensor input_grad_tensor(input_grad->type());
+      phi::DenseTensor input_grad_tensor(input_grad->type());
      input_tensor.set_layout(DataLayout::kNHWC);
      if (channel_last) {
paddle/fluid/operators/conv_transpose_op_npu.cc
@@ -20,7 +20,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
using NPUDeviceContext = platform::NPUDeviceContext;
template <typename T>
@@ -65,7 +64,7 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel<T> {
    std::vector<int> strides(4, 1);
    std::vector<int> dilations(4, 1);
-    Tensor input_tensor, output_tensor;
+    phi::DenseTensor input_tensor, output_tensor;
    input_tensor.ShareDataWith(*input);
    output_tensor.ShareDataWith(*output);
@@ -148,7 +147,7 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel<T> {
    std::vector<int> strides_vec(4, 1);
    std::vector<int> dilations_vec(4, 1);
-    Tensor input_tensor, output_grad_tensor;
+    phi::DenseTensor input_tensor, output_grad_tensor;
    input_tensor.ShareDataWith(*input);
    output_grad_tensor.ShareDataWith(*output_grad);
    if (channel_last) {
@@ -182,7 +181,7 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel<T> {
    }
    if (input_grad) {
      input_grad->mutable_data<T>(ctx.GetPlace());
-      Tensor input_grad_tensor;
+      phi::DenseTensor input_grad_tensor;
      input_grad_tensor.ShareDataWith(*input_grad);
      if (channel_last) {
        input_grad_tensor.set_layout(DataLayout::kNHWC);
@@ -248,7 +247,7 @@ class Conv3DTransposeNPUKernel : public framework::OpKernel<T> {
    std::vector<int> strides(5, 1);
    std::vector<int> dilations(5, 1);
-    Tensor input_tensor, output_tensor, filter_tensor;
+    phi::DenseTensor input_tensor, output_tensor, filter_tensor;
    input_tensor.Resize(input->dims());
    input_tensor.ShareDataWith(*input);
    output_tensor.Resize(output->dims());
paddle/fluid/operators/copy_cross_scope_op.cc
@@ -30,8 +30,6 @@ class OpBase;
}  // namespace imperative
}  // namespace paddle
-using Tensor = phi::DenseTensor;
-
namespace paddle {
namespace operators {

paddle/fluid/operators/correlation_op.cc
@@ -22,8 +22,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
inline std::vector<int64_t> CorrelationOutputSize(int batch,
                                                  int input_height,
                                                  int input_width,
paddle/fluid/operators/cos_sim_op.h
@@ -21,13 +21,11 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename DeviceContext, typename T>
class CosSimKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    // get Tensor
+    // get phi::DenseTensor
    auto* in_x = context.Input<phi::DenseTensor>("X");
    auto* in_y = context.Input<phi::DenseTensor>("Y");
    auto* out_z = context.Output<phi::DenseTensor>("Out");
@@ -74,7 +72,7 @@ template <typename DeviceContext, typename T>
class CosSimGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    // get Tensor
+    // get phi::DenseTensor
    auto* in_x = context.Input<phi::DenseTensor>("X");
    auto* in_y = context.Input<phi::DenseTensor>("Y");
    auto* in_z = context.Input<phi::DenseTensor>("Out");
paddle/fluid/operators/crop_op_npu.cc
@@ -18,8 +18,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename DeviceContext, typename T>
class CropNPUKernel : public framework::OpKernel<T> {
 public:
@@ -71,7 +69,7 @@ class CropNPUKernel : public framework::OpKernel<T> {
              x->dims().size()));
      // shape memory maybe have gc.
-      Tensor tmp_shape(*shape);
+      phi::DenseTensor tmp_shape(*shape);
      tmp_shape.mutable_data<T>(ctx.GetPlace());
      const auto& runner =
@@ -90,7 +88,7 @@ class CropNPUKernel : public framework::OpKernel<T> {
              "(%d) of the Input(X).",
              shape_size.size(),
              x->dims().size()));
-      Tensor tmp_shape(x->dtype());
+      phi::DenseTensor tmp_shape(x->dtype());
      tmp_shape.Resize(phi::make_ddim(shape_size));
      tmp_shape.mutable_data<T>(ctx.GetPlace());
      const auto& runner =
paddle/fluid/operators/cross_entropy_op.h
@@ -23,8 +23,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename DeviceContext, typename T>
class CrossEntropyOpKernel : public framework::OpKernel<T> {
 public:
@@ -36,8 +34,8 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
    int rank = x->dims().size();
    auto label_dims = labels->dims();
-    Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
-    Tensor labels_2d, y_2d;
+    phi::DenseTensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
+    phi::DenseTensor labels_2d, y_2d;
    if (label_dims.size() < rank) {
      labels_2d.ShareDataWith(*labels);
      labels_2d.Resize({phi::product(label_dims), 1});
paddle/fluid/operators/ctc_align_op.h
@@ -24,8 +24,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename DeviceContext, typename T>
class CTCAlignKernel : public framework::OpKernel<T> {
 public:
paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -26,8 +26,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename T, typename Type>
bool is_continuous(const Type &weight_list) {
  bool continuous = true;
@@ -41,7 +39,7 @@ bool is_continuous(const Type &weight_list) {
  return continuous;
}
-int size_sum(const std::vector<const Tensor *> &weight_list) {
+int size_sum(const std::vector<const phi::DenseTensor *> &weight_list) {
  int size = 0;
  for (size_t i = 0; i < weight_list.size(); ++i) {
    auto in_size = weight_list[i]->numel();
@@ -53,8 +51,8 @@ int size_sum(const std::vector<const Tensor *> &weight_list) {
template <typename T>
void weight_to_tensor(const platform::Place &place,
                      gpuStream_t stream,
-                      const std::vector<const Tensor *> &weight_list,
-                      Tensor *weight) {
+                      const std::vector<const phi::DenseTensor *> &weight_list,
+                      phi::DenseTensor *weight) {
  auto weight_data = weight->data<T>();
  int weight_offset = 0;
  for (size_t i = 0; i < weight_list.size(); ++i) {
@@ -72,11 +70,12 @@ void weight_to_tensor(const platform::Place &place,
}
template <typename T>
-void weight_to_tensor_list(const platform::Place &place,
+void weight_to_tensor_list(
+    const platform::Place &place,
    gpuStream_t stream,
-    std::vector<Tensor *> *weight_grad,
-    const std::vector<const Tensor *> &weight_input,
-    const Tensor *weight) {
+    std::vector<phi::DenseTensor *> *weight_grad,
+    const std::vector<const phi::DenseTensor *> &weight_input,
+    const phi::DenseTensor *weight) {
  int weight_offset = 0;
  auto *weight_data = weight->data<T>();
  for (size_t i = 0; i < weight_input.size(); ++i) {
@@ -204,15 +203,15 @@ template <typename T>
class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
-    const Tensor *x = ctx.Input<phi::DenseTensor>("Input");
-    const Tensor *init_h = ctx.Input<phi::DenseTensor>("InitH");
-    const Tensor *init_c = ctx.Input<phi::DenseTensor>("InitC");
+    const phi::DenseTensor *x = ctx.Input<phi::DenseTensor>("Input");
+    const phi::DenseTensor *init_h = ctx.Input<phi::DenseTensor>("InitH");
+    const phi::DenseTensor *init_c = ctx.Input<phi::DenseTensor>("InitC");
-    Tensor *out = ctx.Output<phi::DenseTensor>("Out");
-    Tensor *last_h = ctx.Output<phi::DenseTensor>("LastH");
-    Tensor *last_c = ctx.Output<phi::DenseTensor>("LastC");
-    Tensor *reserve = ctx.Output<phi::DenseTensor>("Reserve");
-    Tensor *state_out = ctx.Output<phi::DenseTensor>("StateOut");
+    phi::DenseTensor *out = ctx.Output<phi::DenseTensor>("Out");
+    phi::DenseTensor *last_h = ctx.Output<phi::DenseTensor>("LastH");
+    phi::DenseTensor *last_c = ctx.Output<phi::DenseTensor>("LastC");
+    phi::DenseTensor *reserve = ctx.Output<phi::DenseTensor>("Reserve");
+    phi::DenseTensor *state_out = ctx.Output<phi::DenseTensor>("StateOut");
    const T *x_data = x->data<T>();
    const T *init_h_data = init_h->data<T>();
@@ -256,7 +255,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
    size_t workspace_size;
    size_t reserve_size;
-    Tensor weight_whole;
+    phi::DenseTensor weight_whole;
    T *w_data = nullptr;
    int weight_numel;
    bool w_initialized = false;
@@ -272,7 +271,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
    if (!w_initialized) {
      auto weight_list = ctx.MultiInput<phi::DenseTensor>("WeightList");
      bool continuous =
-          is_continuous<T, std::vector<const Tensor *>>(weight_list);
+          is_continuous<T, std::vector<const phi::DenseTensor *>>(weight_list);
      weight_numel = size_sum(weight_list);
      if (!continuous) {
@@ -288,7 +287,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
        for (size_t i = 0; i < weight_list.size(); ++i) {
          size_t len = weight_list[i]->numel();
          auto dim = weight_list[i]->dims();
-          const_cast<Tensor *>(weight_list[i])
+          const_cast<phi::DenseTensor *>(weight_list[i])
              ->ShareDataWith(
                  weight_whole.Slice(static_cast<int64_t>(offset),
                                     static_cast<int64_t>(offset + len)))
@@ -481,12 +480,12 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
    auto place = ctx.GetPlace();
    int weight_numel = size_sum(weight_list);
    bool continuous =
-        is_continuous<T, std::vector<const Tensor *>>(weight_list);
+        is_continuous<T, std::vector<const phi::DenseTensor *>>(weight_list);
    auto stream = reinterpret_cast<const phi::GPUContext &>(ctx.device_context())
                      .stream();
-    Tensor weight_whole;
+    phi::DenseTensor weight_whole;
    T *weight_data = nullptr;
    if (!continuous) {
@@ -497,7 +496,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
      weight_data = const_cast<T *>(weight_list[0]->data<T>());
    }
-    Tensor weight_grad;
+    phi::DenseTensor weight_grad;
    phi::funcs::SetConstant<phi::GPUContext, T> zero;
    weight_grad.mutable_data<T>({weight_numel}, ctx.GetPlace());
    zero(dev_ctx, &weight_grad, static_cast<T>(0.0));
@@ -559,7 +558,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
        SequenceLength,
        &workspace_size,
        &reserve_size,
-        const_cast<Tensor *>(state_out));
+        const_cast<phi::DenseTensor *>(state_out));
    phi::DenseTensor workspace_data_;
    workspace_data_.mutable_data<uint8_t>(
paddle/fluid/operators/cumsum_op_mlu.cc
@@ -18,8 +18,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename T>
class CumSumMLUKernel : public framework::OpKernel<T> {
 public:
@@ -34,7 +32,7 @@ class CumSumMLUKernel : public framework::OpKernel<T> {
    out->mutable_data<T>(ctx.GetPlace());
    phi::DenseTensor* input_ptr = const_cast<phi::DenseTensor*>(x);
-    Tensor flat_x(x->type());
+    phi::DenseTensor flat_x(x->type());
    if (flatten) {
      PADDLE_ENFORCE_EQ(
          axis,
paddle/fluid/operators/cumsum_op_npu.cc
@@ -19,8 +19,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
static void CumsumImp(const phi::DenseTensor& input,
                      phi::DenseTensor* output,
                      const framework::NPUAttributeMap& attr_input,
@@ -30,7 +28,7 @@ static void CumsumImp(const phi::DenseTensor& input,
          .stream();
  if (framework::TransToProtoVarType(input.dtype()) ==
      framework::proto::VarType::INT64) {
-    Tensor tmp_input;
+    phi::DenseTensor tmp_input;
    tmp_input.mutable_data<float>(input.dims(), ctx.GetPlace());
    auto dst_acl_dtype =
        ConvertToNpuDtype(framework::TransToProtoVarType(tmp_input.type()));
@@ -41,7 +39,7 @@ static void CumsumImp(const phi::DenseTensor& input,
        {{"dst_type", static_cast<int>(dst_acl_dtype)}});
    cast_runner_1.Run(stream);
-    Tensor tmp_output;
+    phi::DenseTensor tmp_output;
    tmp_output.mutable_data<float>(output->dims(), ctx.GetPlace());
    const auto& runner = NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input);
@@ -86,7 +84,7 @@ class CumSumNPUKernel : public framework::OpKernel<T> {
              -1,
              axis));
-      Tensor new_x(x->type());
+      phi::DenseTensor new_x(x->type());
      new_x.ShareDataWith(*x);
      new_x.Resize(phi::make_ddim({x->numel()}));
paddle/fluid/operators/cvm_op.cc
@@ -21,8 +21,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
class CVMOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

paddle/fluid/operators/cvm_op.cu
@@ -22,7 +22,6 @@ namespace paddle {
namespace operators {
using phi::PADDLE_CUDA_NUM_THREADS;
-using Tensor = phi::DenseTensor;
template <typename T>
__global__ void CvmComputeKernel(const bool use_cvm,

paddle/fluid/operators/cvm_op.h
@@ -19,8 +19,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename T>
void CvmComputeKernel(const bool use_cvm,
                      const int64_t item_width,
paddle/fluid/operators/data_norm_op.cc
@@ -23,7 +23,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
using DataLayout = phi::DataLayout;
template <typename T>
@@ -483,9 +482,9 @@ class DataNormGradOp : public framework::OperatorWithKernel {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Y@GRAD can not be found for computation"));
    }
-    const Tensor *t = nullptr;
-    if (var->IsType<Tensor>()) {
-      t = &var->Get<Tensor>();
+    const phi::DenseTensor *t = nullptr;
+    if (var->IsType<phi::DenseTensor>()) {
+      t = &var->Get<phi::DenseTensor>();
    } else if (var->IsType<phi::DenseTensor>()) {
      t = &var->Get<phi::DenseTensor>();
    }
@@ -523,7 +522,7 @@ class DataNormGradKernel<phi::CPUContext, T> : public framework::OpKernel<T> {
        (data_layout == DataLayout::kNCHW ? x_dims[1]
                                          : x_dims[x_dims.size() - 1]);
    // init output
-    Tensor *d_x = nullptr;
+    phi::DenseTensor *d_x = nullptr;
    if (ctx.HasOutput(framework::GradVarName("X"))) {
      d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    }
@@ -587,12 +586,12 @@ class DataNormGradKernel<phi::CPUContext, T> : public framework::OpKernel<T> {
    EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
    EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
-    Tensor dy_sum;
+    phi::DenseTensor dy_sum;
    dy_sum.Resize({C});
    dy_sum.mutable_data<T>(ctx.GetPlace());
    EigenVectorArrayMap<T> dy_sum_arr(dy_sum.mutable_data<T>(ctx.GetPlace()), C);
-    Tensor dy_mul_x_sub_mean_mul_invstd_sum;
+    phi::DenseTensor dy_mul_x_sub_mean_mul_invstd_sum;
    dy_mul_x_sub_mean_mul_invstd_sum.Resize({C});
    dy_mul_x_sub_mean_mul_invstd_sum.mutable_data<T>(ctx.GetPlace());
    EigenVectorArrayMap<T> dy_mul_x_sub_mean_mul_invstd_sum_arr(
paddle/fluid/operators/data_norm_op.cu
@@ -26,7 +26,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
using DataLayout = phi::DataLayout;
using phi::PADDLE_CUDA_NUM_THREADS;
@@ -166,7 +165,7 @@ class DataNormGradKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
    const int C = x_dims[1];
    // init output
-    Tensor *d_x = nullptr;
+    phi::DenseTensor *d_x = nullptr;
    if (ctx.HasOutput(framework::GradVarName("X"))) {
      d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    }
paddle/fluid/operators/deformable_conv_op_mlu.cc
@@ -18,8 +18,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
-using Tensor = phi::DenseTensor;
-
template <typename T>
class DeformableConvMLUKernel : public framework::OpKernel<T> {
 public:
@@ -58,29 +56,29 @@ class DeformableConvMLUKernel : public framework::OpKernel<T> {
        im2col_step);
    const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
-    Tensor trans_input(input->dtype());
+    phi::DenseTensor trans_input(input->dtype());
    TransposeFromMLUTensor<T>(
        ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/);
-    Tensor trans_offset(offset->dtype());
+    phi::DenseTensor trans_offset(offset->dtype());
    TransposeFromMLUTensor<T>(
        ctx, perm_to_nhwc, offset, &trans_offset, true /*need_reshape_or_alloc*/);
-    Tensor trans_mask(mask->dtype());
+    phi::DenseTensor trans_mask(mask->dtype());
    TransposeFromMLUTensor<T>(
        ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/);
-    Tensor trans_filter(filter->dtype());
+    phi::DenseTensor trans_filter(filter->dtype());
    TransposeFromMLUTensor<T>(
        ctx, perm_to_nhwc, filter, &trans_filter, true /*need_reshape_or_alloc*/);
-    Tensor tmp_output(output->dtype());
+    phi::DenseTensor tmp_output(output->dtype());
    auto output_dims = output->dims();
    tmp_output.mutable_data<T>(
        {output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
@@ -167,54 +165,54 @@ class DeformableConvGradMLUKernel : public framework::OpKernel<T> {
        groups,
        im2col_step);
-    Tensor tmp_input_grad;
+    phi::DenseTensor tmp_input_grad;
    auto input_dims = input->dims();
    tmp_input_grad.mutable_data<T>(
        {input_dims[0], input_dims[2], input_dims[3], input_dims[1]},
        ctx.GetPlace());
-    Tensor tmp_filter_grad;
+    phi::DenseTensor tmp_filter_grad;
    auto filter_dims = filter->dims();
    tmp_filter_grad.mutable_data<T>(
        {filter_dims[0], filter_dims[2], filter_dims[3], filter_dims[1]},
        ctx.GetPlace());
-    Tensor tmp_offset_grad;
+    phi::DenseTensor tmp_offset_grad;
    auto offset_dims = offset->dims();
    tmp_offset_grad.mutable_data<T>(
        {offset_dims[0], offset_dims[2], offset_dims[3], offset_dims[1]},
        ctx.GetPlace());
-    Tensor tmp_mask_grad;
+    phi::DenseTensor tmp_mask_grad;
    auto mask_dims = mask->dims();
    tmp_mask_grad.mutable_data<T>(
        {mask_dims[0], mask_dims[2], mask_dims[3], mask_dims[1]},
        ctx.GetPlace());
    const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
-    Tensor trans_output_grad(output_grad->dtype());
+    phi::DenseTensor trans_output_grad(output_grad->dtype());
    TransposeFromMLUTensor<T>(ctx,
                              perm_to_nhwc,
                              output_grad,
                              &trans_output_grad,
                              true /*need_reshape_or_alloc*/);
-    Tensor trans_input(input->dtype());
+    phi::DenseTensor trans_input(input->dtype());
    TransposeFromMLUTensor<T>(
        ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/);
-    Tensor trans_offset(offset->dtype());
+    phi::DenseTensor trans_offset(offset->dtype());
    TransposeFromMLUTensor<T>(
        ctx, perm_to_nhwc, offset, &trans_offset, true /*need_reshape_or_alloc*/);
-    Tensor trans_mask(mask->dtype());
+    phi::DenseTensor trans_mask(mask->dtype());
    TransposeFromMLUTensor<T>(
        ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/);
-    Tensor trans_filter(filter->dtype());
+    phi::DenseTensor trans_filter(filter->dtype());
    TransposeFromMLUTensor<T>(ctx,
                              perm_to_nhwc,
                              filter,
paddle/fluid/operators/deformable_psroi_pooling_op.cu
...
@@ -39,7 +39,6 @@
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 using phi::PADDLE_CUDA_NUM_THREADS;
 static inline int GET_BLOCKS(const int N) {
...
paddle/fluid/operators/deformable_psroi_pooling_op.h
...
@@ -33,8 +33,6 @@
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 template <typename T>
 T bilinear_interp(const T* data, const T x, const T y, const int width, const int height) {
...
@@ -518,7 +516,7 @@ class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel<T> {
     const int num_classes = no_trans ? 1 : channels_trans / 2;
     const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
-    Tensor roi_batch_id_list;
+    phi::DenseTensor roi_batch_id_list;
     roi_batch_id_list.Resize({num_rois});
     int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
...
paddle/fluid/operators/detection/bbox_util.cu.h
...
@@ -30,8 +30,6 @@ namespace cub = hipcub;
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
 int const kThreadsPerBlock = sizeof(uint64_t) * 8;
...
@@ -47,11 +45,11 @@ struct RangeInitFunctor {
 template <typename T>
 static void SortDescending(const phi::GPUContext& ctx,
-                           const Tensor& value,
-                           Tensor* value_out,
-                           Tensor* index_out) {
+                           const phi::DenseTensor& value,
+                           phi::DenseTensor* value_out,
+                           phi::DenseTensor* index_out) {
   int num = static_cast<int>(value.numel());
-  Tensor index_in_t;
+  phi::DenseTensor index_in_t;
   int* idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
   platform::ForRange<phi::GPUContext> for_range(ctx, num);
   for_range(RangeInitFunctor{0, 1, idx_in});
...
@@ -287,10 +285,10 @@ static __global__ void NMSKernel(const int n_boxes,
 template <typename T>
 static void NMS(const phi::GPUContext& ctx,
-                const Tensor& proposals,
-                const Tensor& sorted_indices,
+                const phi::DenseTensor& proposals,
+                const phi::DenseTensor& sorted_indices,
                 const T nms_threshold,
-                Tensor* keep_out,
+                phi::DenseTensor* keep_out,
                 bool pixel_offset = true) {
   int boxes_num = proposals.dims()[0];
   const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock);
...
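Free helper signatures follow the same rule: parameters that previously used the Tensor alias now name phi::DenseTensor directly, and only the spelled-out type changes. A hedged sketch with a hypothetical helper (the real SortDescending/NMS utilities are the ones shown in bbox_util.cu.h above):

// --- illustrative sketch, not part of the commit ---
#include "paddle/phi/core/dense_tensor.h"  // assumed header for phi::DenseTensor

static int PrepareKeepBuffer(const phi::DenseTensor& proposals,  // was: const Tensor&
                             phi::DenseTensor* keep_out) {       // was: Tensor*
  // Same allocation pattern as the surrounding diff: size the output
  // buffer from the first dimension of the input.
  keep_out->Resize({proposals.dims()[0]});
  return static_cast<int>(proposals.dims()[0]);
}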
paddle/fluid/operators/detection/bipartite_match_op.cc
...
@@ -18,8 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 class BipartiteMatchOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
...
@@ -234,7 +232,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
     auto lod = dist_mat->lod().back();
     for (size_t i = 0; i < lod.size() - 1; ++i) {
       if (lod[i + 1] > lod[i]) {
-        Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
+        phi::DenseTensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
         BipartiteMatch(one_ins, indices + i * col, dist + i * col);
         if (type == "per_prediction") {
           ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold);
...
paddle/fluid/operators/detection/box_clip_op.cu
...
@@ -22,7 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 using LoDTenso = phi::DenseTensor;
 static constexpr int ImInfoSize = 3;
...
paddle/fluid/operators/detection/box_clip_op.h
...
@@ -19,8 +19,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 template <typename DeviceContext, typename T>
 class BoxClipKernel : public framework::OpKernel<T> {
  public:
...
@@ -42,9 +40,10 @@ class BoxClipKernel : public framework::OpKernel<T> {
     auto box_lod = input_box->lod().back();
     int64_t n = static_cast<int64_t>(box_lod.size() - 1);
     for (int i = 0; i < n; ++i) {
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      Tensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]);
-      Tensor output_slice = output_box->Slice(box_lod[i], box_lod[i + 1]);
+      phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1);
+      phi::DenseTensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]);
+      phi::DenseTensor output_slice =
+          output_box->Slice(box_lod[i], box_lod[i + 1]);
       ClipTiledBoxes<T>(dev_ctx, im_info_slice, box_slice, &output_slice);
     }
   }
...
paddle/fluid/operators/detection/box_coder_op_npu.cc
...
@@ -18,8 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 template <typename T>
 struct BoxCoderFunction {
  public:
...
@@ -28,31 +26,31 @@ struct BoxCoderFunction {
     stream = ctx.template device_context<paddle::platform::NPUDeviceContext>().stream();
   }
-  Tensor Adds(const phi::DenseTensor& x, float scalar) {
-    Tensor y;
+  phi::DenseTensor Adds(const phi::DenseTensor& x, float scalar) {
+    phi::DenseTensor y;
     y.mutable_data<T>(x.dims(), place);
     const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}});
     runner.Run(stream);
     return y;
   }
-  Tensor Muls(const phi::DenseTensor& x, float scalar) {
-    Tensor y;
+  phi::DenseTensor Muls(const phi::DenseTensor& x, float scalar) {
+    phi::DenseTensor y;
     y.mutable_data<T>(x.dims(), place);
     const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}});
     runner.Run(stream);
     return y;
   }
-  Tensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) {
-    Tensor z;
+  phi::DenseTensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) {
+    phi::DenseTensor z;
     z.mutable_data<T>(x.dims(), place);
     const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {});
     runner.Run(stream);
     return z;
   }
-  Tensor SubWithBroadCast(const phi::DenseTensor& x,
+  phi::DenseTensor SubWithBroadCast(const phi::DenseTensor& x,
                           const phi::DenseTensor& y,
                           const framework::DDim& shape) {
-    Tensor z;
+    phi::DenseTensor z;
     z.mutable_data<T>(shape, place);
     const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {});
     runner.Run(stream);
...
@@ -66,10 +64,10 @@ struct BoxCoderFunction {
     const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {});
     runner.Run(stream);
   }
-  Tensor DivWithBroadCast(const phi::DenseTensor& x,
+  phi::DenseTensor DivWithBroadCast(const phi::DenseTensor& x,
                           const phi::DenseTensor& y,
                           const framework::DDim& shape) {
-    Tensor z;
+    phi::DenseTensor z;
     DivWithBroadCastVoid(x, y, shape, &z);
     return z;
   }
...
@@ -81,10 +79,10 @@ struct BoxCoderFunction {
     const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {});
     runner.Run(stream);
   }
-  Tensor MulWithBroadCast(const phi::DenseTensor& x,
+  phi::DenseTensor MulWithBroadCast(const phi::DenseTensor& x,
                           const phi::DenseTensor& y,
                           const framework::DDim& shape) {
-    Tensor z;
+    phi::DenseTensor z;
     MulWithBroadCastVoid(x, y, shape, &z);
     return z;
   }
...
@@ -96,36 +94,36 @@ struct BoxCoderFunction {
     const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {});
     runner.Run(stream);
   }
-  Tensor AddWithBroadCast(const phi::DenseTensor& x,
+  phi::DenseTensor AddWithBroadCast(const phi::DenseTensor& x,
                           const phi::DenseTensor& y,
                           const framework::DDim& shape) {
-    Tensor z;
+    phi::DenseTensor z;
     AddWithBroadCastVoid(x, y, shape, &z);
     return z;
   }
-  Tensor Abs(const phi::DenseTensor& x) {
-    Tensor y;
+  phi::DenseTensor Abs(const phi::DenseTensor& x) {
+    phi::DenseTensor y;
     y.mutable_data<T>(x.dims(), place);
     const auto& runner = NpuOpRunner("Abs", {x}, {y}, {});
     runner.Run(stream);
     return y;
   }
-  Tensor Log(const phi::DenseTensor& x) {
-    Tensor t_x_m1 = Adds(x, -1);
-    Tensor y;
+  phi::DenseTensor Log(const phi::DenseTensor& x) {
+    phi::DenseTensor t_x_m1 = Adds(x, -1);
+    phi::DenseTensor y;
     y.mutable_data<T>(x.dims(), place);
     const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {});
     runner.Run(stream);
     return y;
   }
-  Tensor Exp(const phi::DenseTensor& x) {
-    Tensor y;
+  phi::DenseTensor Exp(const phi::DenseTensor& x) {
+    phi::DenseTensor y;
     y.mutable_data<T>(x.dims(), place);
     const auto& runner = NpuOpRunner("Exp", {x}, {y}, {});
     runner.Run(stream);
     return y;
   }
-  Tensor Dot(const phi::DenseTensor& x, const phi::DenseTensor& y) {
+  phi::DenseTensor Dot(const phi::DenseTensor& x, const phi::DenseTensor& y) {
     auto dim_x = x.dims();
     auto dim_y = y.dims();
     PADDLE_ENFORCE_EQ(
...
@@ -145,7 +143,7 @@ struct BoxCoderFunction {
             "got dim_x[1] = %d, dim_y[0] = %d.",
             dim_x[1],
             dim_y[0]));
-    Tensor z;
+    phi::DenseTensor z;
     z.mutable_data<T>({dim_x[0], dim_y[1]}, place);
     const auto& runner = NpuOpRunner("MatMul",
...
@@ -155,7 +153,7 @@ struct BoxCoderFunction {
     runner.Run(stream);
     return z;
   }
-  void ConcatVoid(const std::vector<Tensor>& inputs,
+  void ConcatVoid(const std::vector<phi::DenseTensor>& inputs,
                   const framework::DDim& shape_out,
                   int axis,
                   phi::DenseTensor* output) {
...
@@ -172,18 +170,18 @@ struct BoxCoderFunction {
     runner.AddInputNames(names);
     runner.Run(stream);
   }
-  Tensor Concat(const std::vector<Tensor>& inputs,
+  phi::DenseTensor Concat(const std::vector<phi::DenseTensor>& inputs,
                 const framework::DDim& shape_out,
                 int axis) {
-    Tensor output;
+    phi::DenseTensor output;
     ConcatVoid(inputs, shape_out, axis, &output);
     return output;
   }
-  Tensor Slice(const phi::DenseTensor& x,
+  phi::DenseTensor Slice(const phi::DenseTensor& x,
                const std::vector<int>& offsets,
                const std::vector<int>& size,
                const framework::DDim& shape) {
-    Tensor y;
+    phi::DenseTensor y;
     y.mutable_data<T>(shape, place);
     const auto& runner = NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}});
...
@@ -218,8 +216,8 @@ void BoxCoderEnc(const framework::ExecutionContext& ctx,
   auto M = pb->dims()[0];
   auto N = tb->dims()[0];
   auto shape_0 = phi::make_ddim({4, 2});
-  Tensor m_diff;
-  Tensor m_aver;
+  phi::DenseTensor m_diff;
+  phi::DenseTensor m_aver;
   std::vector<T> vec_diff = {static_cast<T>(-1), static_cast<T>(0), static_cast<T>(0),
...
@@ -240,10 +238,10 @@ void BoxCoderEnc(const framework::ExecutionContext& ctx,
   Vector2Tensor<T>(ctx, vec_aver, shape_0, &m_aver);
   BoxCoderFunction<T> F(ctx);
-  Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5));
-  Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1));
-  Tensor tb_xy = F.Dot(*tb, m_aver);
-  Tensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 0 : 1));
+  phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5));
+  phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1));
+  phi::DenseTensor tb_xy = F.Dot(*tb, m_aver);
+  phi::DenseTensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 0 : 1));
   pb_xy.Resize({1, M, 2});
   pb_wh.Resize({1, M, 2});
...
@@ -253,15 +251,16 @@ void BoxCoderEnc(const framework::ExecutionContext& ctx,
   auto shape_half = phi::make_ddim({N, M, 2});
   auto shape_full = phi::make_ddim({N, M, 4});
-  Tensor out_xy_0 = F.DivWithBroadCast(
+  phi::DenseTensor out_xy_0 = F.DivWithBroadCast(
       F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half);
-  Tensor out_wh_0 = F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half)));
-  Tensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2);
+  phi::DenseTensor out_wh_0 =
+      F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half)));
+  phi::DenseTensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2);
   if (pbv) {
     F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out);
   } else {
-    Tensor t_var;
+    phi::DenseTensor t_var;
     std::vector<T> vec_var(4);
     for (auto i = 0; i < 4; i++) {
       vec_var[i] = static_cast<T>(variance[i]);
...
@@ -281,8 +280,8 @@ void BoxCoderDec(const framework::ExecutionContext& ctx,
                  int axis,
                  phi::DenseTensor* out) {
   auto shape_0 = phi::make_ddim({4, 2});
-  Tensor m_diff;
-  Tensor m_aver;
+  phi::DenseTensor m_diff;
+  phi::DenseTensor m_aver;
   std::vector<T> vec_diff = {static_cast<T>(-1), static_cast<T>(0), static_cast<T>(0),
...
@@ -303,8 +302,8 @@ void BoxCoderDec(const framework::ExecutionContext& ctx,
   Vector2Tensor<T>(ctx, vec_aver, shape_0, &m_aver);
   BoxCoderFunction<T> F(ctx);
-  Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5));
-  Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1));
+  phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5));
+  phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1));
   auto pb_resize_shape = axis == 0 ? phi::make_ddim({1, pb->dims()[0], 2}) : phi::make_ddim({pb->dims()[0], 1, 2});
   pb_xy.Resize(pb_resize_shape);
...
@@ -313,18 +312,22 @@ void BoxCoderDec(const framework::ExecutionContext& ctx,
   auto tbox_slice_shape = phi::make_ddim({tb->dims()[0], tb->dims()[1], 2});
   std::vector<int> tbox_slice_size = {static_cast<int>(tb->dims()[0]), static_cast<int>(tb->dims()[1]), 2};
-  Tensor tbox01 = F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape);
-  Tensor tbox23 = F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape);
+  phi::DenseTensor tbox01 =
+      F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape);
+  phi::DenseTensor tbox23 =
+      F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape);
-  Tensor tb_xy;
-  Tensor tb_wh;
+  phi::DenseTensor tb_xy;
+  phi::DenseTensor tb_wh;
   if (pbv) {
     auto pbvt_slice_shape = phi::make_ddim({pbv->dims()[0], 2});
     auto pbvt_resize_shape = axis == 0 ? phi::make_ddim({1, pbv->dims()[0], 2}) : phi::make_ddim({pbv->dims()[0], 1, 2});
     std::vector<int> pbvt_slice_size = {static_cast<int>(pbv->dims()[0]), 2};
-    Tensor pbv_t01 = F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape);
-    Tensor pbv_t23 = F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape);
+    phi::DenseTensor pbv_t01 =
+        F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape);
+    phi::DenseTensor pbv_t23 =
+        F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape);
     pbv_t01.Resize(pbvt_resize_shape);
     pbv_t23.Resize(pbvt_resize_shape);
...
@@ -345,7 +348,7 @@ void BoxCoderDec(const framework::ExecutionContext& ctx,
                            &tb_xy);
     F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh);
   } else {
-    Tensor t_var01, t_var23;
+    phi::DenseTensor t_var01, t_var23;
     auto t_var_shape = phi::make_ddim({1, 1, 2});
     std::vector<T> vec_var01 = {static_cast<T>(variance[0]), static_cast<T>(variance[1])};
...
@@ -366,9 +369,9 @@ void BoxCoderDec(const framework::ExecutionContext& ctx,
                            tbox_slice_shape,
                            &tb_wh);
   }
-  Tensor obox01 =
+  phi::DenseTensor obox01 =
       F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape);
-  Tensor obox23 =
+  phi::DenseTensor obox23 =
       F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape), (norm ? 0 : -1));
   F.ConcatVoid({obox01, obox23}, out->dims(), 2, out);
...
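Return values and containers change in the same way: helpers that used to build std::vector<Tensor> or return Tensor by value now spell out phi::DenseTensor. A hedged sketch with a hypothetical splitter (illustrative only; the real helpers are the Concat/Slice members above):

// --- illustrative sketch, not part of the commit ---
#include <vector>
#include "paddle/phi/core/dense_tensor.h"  // assumed header for phi::DenseTensor

static std::vector<phi::DenseTensor> SplitHalves(const phi::DenseTensor& x) {
  std::vector<phi::DenseTensor> parts;      // was: std::vector<Tensor>
  int64_t mid = x.dims()[0] / 2;
  parts.emplace_back(x.Slice(0, mid));      // Slice(begin, end), as used in the diff
  parts.emplace_back(x.Slice(mid, x.dims()[0]));
  return parts;
}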
paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
...
@@ -16,7 +16,6 @@ limitations under the License.*/
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 class CollectFpnProposalsOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
...
paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
...
@@ -33,8 +33,6 @@ namespace cub = hipcub;
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 static constexpr int kNumCUDAThreads = 64;
 static constexpr int kNumMaxinumNumBlocks = 4096;
...
@@ -74,13 +72,13 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     int real_post_num = min(post_nms_topN, total_roi_num);
     fpn_rois->mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
-    Tensor concat_rois;
-    Tensor concat_scores;
+    phi::DenseTensor concat_rois;
+    phi::DenseTensor concat_scores;
     T* concat_rois_data = concat_rois.mutable_data<T>({total_roi_num, kBBoxSize}, dev_ctx.GetPlace());
     T* concat_scores_data = concat_scores.mutable_data<T>({total_roi_num, 1}, dev_ctx.GetPlace());
-    Tensor roi_batch_id_list;
+    phi::DenseTensor roi_batch_id_list;
     roi_batch_id_list.Resize({total_roi_num});
     int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
...
@@ -130,20 +128,20 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     }
     // copy batch id list to GPU
-    Tensor roi_batch_id_list_gpu;
+    phi::DenseTensor roi_batch_id_list_gpu;
     framework::TensorCopy(roi_batch_id_list, dev_ctx.GetPlace(), &roi_batch_id_list_gpu);
-    Tensor index_in_t;
+    phi::DenseTensor index_in_t;
     int* idx_in = index_in_t.mutable_data<int>({total_roi_num}, dev_ctx.GetPlace());
     platform::ForRange<phi::GPUContext> for_range_total(dev_ctx, total_roi_num);
     for_range_total(RangeInitFunctor{0, 1, idx_in});
-    Tensor keys_out_t;
+    phi::DenseTensor keys_out_t;
     T* keys_out = keys_out_t.mutable_data<T>({total_roi_num}, dev_ctx.GetPlace());
-    Tensor index_out_t;
+    phi::DenseTensor index_out_t;
     int* idx_out = index_out_t.mutable_data<int>({total_roi_num}, dev_ctx.GetPlace());
...
@@ -175,21 +173,21 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
                                               sizeof(T) * 8,
                                               dev_ctx.stream());
     index_out_t.Resize({real_post_num});
-    Tensor sorted_rois;
+    phi::DenseTensor sorted_rois;
     sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
-    Tensor sorted_batch_id;
+    phi::DenseTensor sorted_batch_id;
     sorted_batch_id.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
     phi::funcs::GPUGather<T>(dev_ctx, concat_rois, index_out_t, &sorted_rois);
     phi::funcs::GPUGather<int>(dev_ctx, roi_batch_id_list_gpu, index_out_t, &sorted_batch_id);
-    Tensor batch_index_t;
+    phi::DenseTensor batch_index_t;
     int* batch_idx_in = batch_index_t.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
     platform::ForRange<phi::GPUContext> for_range_post(dev_ctx, real_post_num);
     for_range_post(RangeInitFunctor{0, 1, batch_idx_in});
-    Tensor out_id_t;
+    phi::DenseTensor out_id_t;
     int* out_id_data = out_id_t.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
     // Determine temporary device storage requirements
...
@@ -222,7 +220,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     phi::funcs::GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
-    Tensor length_lod;
+    phi::DenseTensor length_lod;
     int* length_lod_data = length_lod.mutable_data<int>({lod_size}, dev_ctx.GetPlace());
     phi::funcs::SetConstant<phi::GPUContext, int> set_zero;
...
paddle/fluid/operators/detection/density_prior_box_op_npu.cc
...
@@ -15,7 +15,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 using fp16 = paddle::platform::float16;
 template <typename T>
...
@@ -89,7 +88,7 @@ struct DensityPriorBoxFunction {
     const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {});
     runner.Run(stream);
   }
-  void Concat(const std::vector<Tensor>& inputs,
+  void Concat(const std::vector<phi::DenseTensor>& inputs,
               int axis,
               phi::DenseTensor* output) {
     // output should be init first
...
@@ -131,14 +130,14 @@ struct DensityPriorBoxFunction {
   platform::Place place;
   aclrtStream stream;
   const framework::ExecutionContext& ctx;
-  Tensor t0;
-  Tensor t1;
-  Tensor tn;
+  phi::DenseTensor t0;
+  phi::DenseTensor t1;
+  phi::DenseTensor tn;
 };
 template <>
 void DensityPriorBoxFunction<fp16>::Arange(int n, phi::DenseTensor* x) {
-  Tensor x_fp32(experimental::DataType::FLOAT32);
+  phi::DenseTensor x_fp32(experimental::DataType::FLOAT32);
   x_fp32.mutable_data<float>(x->dims(), place);
   FillNpuTensorWithConstant<float>(&tn, static_cast<float>(n));
   const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {});
...
@@ -149,7 +148,7 @@ void DensityPriorBoxFunction<fp16>::Arange(int n, phi::DenseTensor* x) {
 template <>
 void DensityPriorBoxFunction<fp16>::FloatVec2Tsr(const std::vector<float>& vec, phi::DenseTensor* tsr_dst) {
-  Tensor tsr_fp32(experimental::DataType::FLOAT32);
+  phi::DenseTensor tsr_fp32(experimental::DataType::FLOAT32);
   tsr_fp32.mutable_data<float>(tsr_dst->dims(), place);
   framework::TensorFromVector<float>(vec, ctx.device_context(), &tsr_fp32);
   ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
...
@@ -185,9 +184,9 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();
     DensityPriorBoxFunction<T> F(ctx);
-    Tensor h(_type);
+    phi::DenseTensor h(_type);
     h.mutable_data<T>({layer_h}, place);
-    Tensor w(_type);
+    phi::DenseTensor w(_type);
     w.mutable_data<T>({layer_w}, place);
     F.Arange(layer_h, &h);
     F.Arange(layer_w, &w);
...
@@ -203,11 +202,11 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel<T> {
     for (size_t i = 0; i < densities.size(); ++i) {
       num_priors_per_ratio += densities[i] * densities[i];
     }
-    Tensor di(_type);
-    Tensor dj(_type);
-    Tensor shifts(_type);
-    Tensor box_w_ratio(_type);
-    Tensor box_h_ratio(_type);
+    phi::DenseTensor di(_type);
+    phi::DenseTensor dj(_type);
+    phi::DenseTensor shifts(_type);
+    phi::DenseTensor box_w_ratio(_type);
+    phi::DenseTensor box_h_ratio(_type);
     di.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
     dj.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
     shifts.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
...
@@ -220,19 +219,21 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel<T> {
       // Range = start:start+ratios_size*density_sqr, density = densities[i]
       int density_sqr = densities[i] * densities[i];
       // shifts[Range] = [step_average/density]*ratios_size*density_sqr
-      Tensor shifts_part =
+      phi::DenseTensor shifts_part =
           shifts.Slice(start, start + ratios_size * density_sqr);
       FillNpuTensorWithConstant<T>(&shifts_part, static_cast<T>(step_average / densities[i]));
       // di[Range] = [ i // density for i in range(density_sqr) ] * ratios_size
       // dj[Range] = [ i % density for i in range(density_sqr) ] * ratios_size
-      Tensor di_part = di.Slice(start, start + ratios_size * density_sqr);
-      Tensor dj_part = dj.Slice(start, start + ratios_size * density_sqr);
+      phi::DenseTensor di_part =
+          di.Slice(start, start + ratios_size * density_sqr);
+      phi::DenseTensor dj_part =
+          dj.Slice(start, start + ratios_size * density_sqr);
       if (densities[i] > 1) {
         di_part.Resize({ratios_size, densities[i], densities[i]});
         dj_part.Resize({ratios_size, densities[i], densities[i]});
-        Tensor range_n(_type);
+        phi::DenseTensor range_n(_type);
         range_n.mutable_data<T>({densities[i]}, place);
         F.Arange(densities[i], &range_n);
         range_n.Resize({1, densities[i], 1});
...
@@ -254,9 +255,9 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel<T> {
         // Range_mini = start_box_ratio:start_box_ratio+density_sqr
         // box_h_ratio[Range_mini] = [fixed_sizes[i] * sqrt(ar)] * density_sqr
         // box_w_ratio[Range_mini] = [fixed_sizes[i] / sqrt(ar)] * density_sqr
-        Tensor box_h_ratio_part =
+        phi::DenseTensor box_h_ratio_part =
            box_h_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr);
-        Tensor box_w_ratio_part =
+        phi::DenseTensor box_w_ratio_part =
            box_w_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr);
         FillNpuTensorWithConstant<T>(&box_w_ratio_part, static_cast<T>(fixed_sizes[i] * sqrt(ar)));
...
@@ -274,8 +275,8 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel<T> {
     // c_x = (w+offset)*step_w - 0.5*step_average + 0.5*shifts + dj*shifts
     // c_y = (h+offset)*step_h - 0.5*step_average + 0.5*shifts + di*shifts
-    Tensor c_x(_type);
-    Tensor c_y(_type);
+    phi::DenseTensor c_x(_type);
+    phi::DenseTensor c_y(_type);
     auto dim0 = phi::make_ddim({1, layer_w, ratios_size * num_priors_per_ratio, 1});
     auto dim1 =
...
@@ -301,17 +302,17 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel<T> {
     F.Muls(&box_w_ratio, static_cast<float>(0.5), &box_w_ratio);
     F.Muls(&box_h_ratio, static_cast<float>(0.5), &box_h_ratio);
-    Tensor zero_t(_type);
-    Tensor one_t(_type);
+    phi::DenseTensor zero_t(_type);
+    phi::DenseTensor one_t(_type);
     zero_t.mutable_data<T>({1}, place);
     one_t.mutable_data<T>({1}, place);
     FillNpuTensorWithConstant<T>(&zero_t, static_cast<T>(0));
     FillNpuTensorWithConstant<T>(&one_t, static_cast<T>(1));
-    Tensor outbox0(_type);
-    Tensor outbox1(_type);
-    Tensor outbox2(_type);
-    Tensor outbox3(_type);
+    phi::DenseTensor outbox0(_type);
+    phi::DenseTensor outbox1(_type);
+    phi::DenseTensor outbox2(_type);
+    phi::DenseTensor outbox3(_type);
     outbox0.mutable_data<T>(dim0, place);
     outbox1.mutable_data<T>(dim1, place);
     outbox2.mutable_data<T>(dim0, place);
...
@@ -349,17 +350,17 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel<T> {
         {layer_h, layer_w, ratios_size * num_priors_per_ratio, 4});
     boxes->mutable_data<T>(place);
     vars->mutable_data<T>(place);
-    Tensor boxes_share(_type);
-    Tensor vars_share(_type);
+    phi::DenseTensor boxes_share(_type);
+    phi::DenseTensor vars_share(_type);
     boxes_share.ShareDataWith(*boxes);
     boxes_share.Resize(out_dim);
     vars_share.ShareDataWith(*vars);
     vars_share.Resize(out_dim);
-    Tensor box0(_type);
-    Tensor box1(_type);
-    Tensor box2(_type);
-    Tensor box3(_type);
+    phi::DenseTensor box0(_type);
+    phi::DenseTensor box1(_type);
+    phi::DenseTensor box2(_type);
+    phi::DenseTensor box3(_type);
     // out_dim = {layer_h, layer_w, ratios_size*num_priors_per_ratio, 1}
     out_dim[3] = 1;
     box0.mutable_data<T>(out_dim, place);
...
@@ -377,7 +378,7 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel<T> {
     std::vector<int> multiples = {layer_h, layer_w, ratios_size * num_priors_per_ratio, 1};
-    Tensor variances_t(_type);
+    phi::DenseTensor variances_t(_type);
     // variances.size() == 4
     variances_t.mutable_data<T>({4}, place);
     F.FloatVec2Tsr(variances, &variances_t);
...
paddle/fluid/operators/detection/generate_mask_labels_op.cc
...
@@ -25,7 +25,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 const int kBoxDim = 4;
 template <typename T>
...
@@ -151,7 +150,8 @@ static inline void ExpandMaskTarget(const phi::CPUContext& ctx,
 }
 template <typename T>
-std::vector<Tensor> SampleMaskForOneImage(const phi::CPUContext& ctx,
+std::vector<phi::DenseTensor> SampleMaskForOneImage(
+    const phi::CPUContext& ctx,
     const phi::DenseTensor& im_info,
     const phi::DenseTensor& gt_classes,
     const phi::DenseTensor& is_crowd,
...
@@ -218,15 +218,15 @@ std::vector<Tensor> SampleMaskForOneImage(const phi::CPUContext& ctx,
   int gt_num = mask_gt_inds.size();
   int fg_num = fg_inds.size();
-  Tensor boxes_from_polys;
+  phi::DenseTensor boxes_from_polys;
   boxes_from_polys.mutable_data<T>({gt_num, 4}, platform::CPUPlace());
   Poly2Boxes(gt_polys, boxes_from_polys.data<T>());
   std::vector<int> roi_has_mask = std::vector<int>(fg_inds.begin(), fg_inds.end());
-  Tensor mask_class_labels;
-  Tensor masks;
-  Tensor rois_fg;
+  phi::DenseTensor mask_class_labels;
+  phi::DenseTensor masks;
+  phi::DenseTensor rois_fg;
   auto im_scale = im_info.data<T>()[2];
   if (fg_num > 0) {
...
@@ -251,7 +251,7 @@ std::vector<Tensor> SampleMaskForOneImage(const phi::CPUContext& ctx,
       rois_fg_data[k] = rois_fg_data[k] / im_scale;
     }
-    Tensor overlaps_bbfg_bbpolys;
+    phi::DenseTensor overlaps_bbfg_bbpolys;
     overlaps_bbfg_bbpolys.mutable_data<T>({fg_num, gt_num}, ctx.GetPlace());
     BboxOverlaps<T>(rois_fg, boxes_from_polys, &overlaps_bbfg_bbpolys);
...
@@ -306,7 +306,7 @@ std::vector<Tensor> SampleMaskForOneImage(const phi::CPUContext& ctx,
     roi_has_mask = std::vector<int>(bg_inds.begin(), bg_inds.end());
   }
-  Tensor masks_expand;
+  phi::DenseTensor masks_expand;
   ExpandMaskTarget<T>(ctx, masks, mask_class_labels, resolution, num_classes, &masks_expand);
...
@@ -315,13 +315,13 @@ std::vector<Tensor> SampleMaskForOneImage(const phi::CPUContext& ctx,
     rois_fg_data[k] = rois_fg_data[k] * im_scale;
   }
-  Tensor roi_has_mask_t;
+  phi::DenseTensor roi_has_mask_t;
   int roi_has_mask_size = roi_has_mask.size();
   int* roi_has_mask_data = roi_has_mask_t.mutable_data<int>({roi_has_mask_size, 1}, ctx.GetPlace());
   std::copy(roi_has_mask.begin(), roi_has_mask.end(), roi_has_mask_data);
-  std::vector<Tensor> res;
+  std::vector<phi::DenseTensor> res;
   res.emplace_back(rois_fg);
   res.emplace_back(roi_has_mask_t);
   res.emplace_back(masks_expand);
...
@@ -405,23 +405,23 @@ class GenerateMaskLabelsKernel : public framework::OpKernel<T> {
         lod0.emplace_back(num_mask);
         continue;
       }
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      Tensor gt_classes_slice =
+      phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1);
+      phi::DenseTensor gt_classes_slice =
           gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]);
-      Tensor is_crowd_slice =
+      phi::DenseTensor is_crowd_slice =
           is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
-      Tensor label_int32_slice =
+      phi::DenseTensor label_int32_slice =
          label_int32->Slice(label_int32_lod[i], label_int32_lod[i + 1]);
-      Tensor rois_slice = rois->Slice(rois_lod[i], rois_lod[i + 1]);
+      phi::DenseTensor rois_slice = rois->Slice(rois_lod[i], rois_lod[i + 1]);
       auto sub_lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(gt_segms_lod, i, i + 1, 0);
       auto lod_length = sub_lod_and_offset.first;
       size_t s = sub_lod_and_offset.second.first;
       size_t e = sub_lod_and_offset.second.second;
-      Tensor gt_segms_slice = gt_segms->Slice(s, e);
+      phi::DenseTensor gt_segms_slice = gt_segms->Slice(s, e);
-      std::vector<Tensor> tensor_output =
+      std::vector<phi::DenseTensor> tensor_output =
          SampleMaskForOneImage<T>(dev_ctx,
                                   im_info_slice,
                                   gt_classes_slice,
...
@@ -433,9 +433,9 @@ class GenerateMaskLabelsKernel : public framework::OpKernel<T> {
                                   resolution,
                                   lod_length);
-      Tensor sampled_mask_rois = tensor_output[0];
-      Tensor sampled_roi_has_mask_int32 = tensor_output[1];
-      Tensor sampled_mask_int32 = tensor_output[2];
+      phi::DenseTensor sampled_mask_rois = tensor_output[0];
+      phi::DenseTensor sampled_roi_has_mask_int32 = tensor_output[1];
+      phi::DenseTensor sampled_mask_int32 = tensor_output[2];
       AppendMask<T>(mask_rois, kBoxDim * num_mask, &sampled_mask_rois);
       AppendMask<int>(
...
paddle/fluid/operators/detection/generate_proposal_labels_op.cc
...
@@ -25,7 +25,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 const int kBoxDim = 4;
 template <typename T>
...
@@ -174,7 +173,7 @@ void Concat(const phi::CPUContext& context,
             const phi::DenseTensor& in_tensor_b,
             phi::DenseTensor* out_tensor) {
   int axis = 0;
-  std::vector<Tensor> inputs;
+  std::vector<phi::DenseTensor> inputs;
   inputs.emplace_back(in_tensor_a);
   inputs.emplace_back(in_tensor_b);
   math::ConcatFunctor<phi::CPUContext, T> concat_functor;
...
@@ -300,7 +299,7 @@ void GatherBoxesLabels(const phi::CPUContext& context,
                        phi::DenseTensor* sampled_max_overlap) {
   int fg_num = fg_inds.size();
   int bg_num = bg_inds.size();
-  Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t;
+  phi::DenseTensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t;
   int* fg_inds_data = fg_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
   int* bg_inds_data = bg_inds_t.mutable_data<int>({bg_num}, context.GetPlace());
   int* gt_box_inds_data =
...
@@ -312,7 +311,7 @@ void GatherBoxesLabels(const phi::CPUContext& context,
   std::copy(gt_inds.begin(), gt_inds.end(), gt_box_inds_data);
   std::copy(gt_inds.begin(), gt_inds.end(), gt_label_inds_data);
-  Tensor fg_boxes, bg_boxes, fg_labels, bg_labels;
+  phi::DenseTensor fg_boxes, bg_boxes, fg_labels, bg_labels;
   fg_boxes.mutable_data<T>({fg_num, kBoxDim}, context.GetPlace());
   phi::funcs::CPUGather<T>(context, boxes, fg_inds_t, &fg_boxes);
   bg_boxes.mutable_data<T>({bg_num, kBoxDim}, context.GetPlace());
...
@@ -325,7 +324,7 @@ void GatherBoxesLabels(const phi::CPUContext& context,
   phi::funcs::set_constant(context, &bg_labels, 0);
   Concat<int>(context, fg_labels, bg_labels, sampled_labels);
-  Tensor fg_max_overlap, bg_max_overlap;
+  phi::DenseTensor fg_max_overlap, bg_max_overlap;
   fg_max_overlap.mutable_data<T>({fg_num}, context.GetPlace());
   phi::funcs::CPUGather<T>(context, max_overlap, fg_inds_t, &fg_max_overlap);
   bg_max_overlap.mutable_data<T>({bg_num}, context.GetPlace());
...
@@ -334,7 +333,7 @@ void GatherBoxesLabels(const phi::CPUContext& context,
 }
 template <typename T>
-std::vector<Tensor> SampleRoisForOneImage(
+std::vector<phi::DenseTensor> SampleRoisForOneImage(
     const phi::CPUContext& context,
     const phi::DenseTensor& rpn_rois_in,
     const phi::DenseTensor& gt_classes,
...
@@ -355,7 +354,7 @@ std::vector<Tensor> SampleRoisForOneImage(
     const phi::DenseTensor& max_overlap) {
   // 1.1 map to original image
   auto im_scale = im_info.data<T>()[2];
-  Tensor rpn_rois;
+  phi::DenseTensor rpn_rois;
   rpn_rois.mutable_data<T>(rpn_rois_in.dims(), context.GetPlace());
   const T* rpn_rois_in_dt = rpn_rois_in.data<T>();
   T* rpn_rois_dt = rpn_rois.data<T>();
...
@@ -367,10 +366,10 @@ std::vector<Tensor> SampleRoisForOneImage(
   int proposals_num = 1;
   if (is_cascade_rcnn) {
-    Tensor keep;
+    phi::DenseTensor keep;
     FilterRoIs<T>(context, rpn_rois, max_overlap, &keep);
-    Tensor roi_filter;
-    // Tensor box_filter;
+    phi::DenseTensor roi_filter;
+    // phi::DenseTensor box_filter;
     if (keep.numel() == 0) {
       phi::funcs::SetConstant<phi::CPUContext, T> set_zero;
       roi_filter.mutable_data<T>({proposals_num, kBoxDim}, context.GetPlace());
...
@@ -389,16 +388,16 @@ std::vector<Tensor> SampleRoisForOneImage(
   // 1.2 compute overlaps
   proposals_num += gt_boxes.dims()[0];
-  Tensor proposal_to_gt_overlaps;
+  phi::DenseTensor proposal_to_gt_overlaps;
   proposal_to_gt_overlaps.mutable_data<T>({proposals_num, gt_boxes.dims()[0]}, context.GetPlace());
-  Tensor boxes;
+  phi::DenseTensor boxes;
   boxes.mutable_data<T>({proposals_num, kBoxDim}, context.GetPlace());
   Concat<T>(context, gt_boxes, rpn_rois, &boxes);
   BboxOverlaps<T>(boxes, gt_boxes, &proposal_to_gt_overlaps);
-  Tensor proposal_with_max_overlap;
+  phi::DenseTensor proposal_with_max_overlap;
   proposal_with_max_overlap.mutable_data<T>({proposals_num}, context.GetPlace());
...
@@ -423,7 +422,8 @@ std::vector<Tensor> SampleRoisForOneImage(
   std::vector<int> mapped_gt_inds = fg_bg_gt[2];  // mapped_gt_labels
   // Gather boxes and labels
-  Tensor sampled_boxes, sampled_labels, sampled_gts, sampled_max_overlap;
+  phi::DenseTensor sampled_boxes, sampled_labels, sampled_gts,
+      sampled_max_overlap;
   int fg_num = fg_inds.size();
   int bg_num = bg_inds.size();
   int boxes_num = fg_num + bg_num;
...
@@ -446,7 +446,7 @@ std::vector<Tensor> SampleRoisForOneImage(
                      &sampled_max_overlap);
   // Compute targets
-  Tensor bbox_targets_single;
+  phi::DenseTensor bbox_targets_single;
   bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
   BoxToDelta<T>(fg_num,
                 sampled_boxes,
...
@@ -456,14 +456,14 @@ std::vector<Tensor> SampleRoisForOneImage(
                 &bbox_targets_single);
   // Scale rois
-  Tensor sampled_rois;
+  phi::DenseTensor sampled_rois;
   sampled_rois.mutable_data<T>(sampled_boxes.dims(), context.GetPlace());
   auto sampled_rois_et = framework::EigenTensor<T, 2>::From(sampled_rois);
   auto sampled_boxes_et = framework::EigenTensor<T, 2>::From(sampled_boxes);
   sampled_rois_et = sampled_boxes_et * im_scale;
   // Expand box targets
-  Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights;
+  phi::DenseTensor bbox_targets, bbox_inside_weights, bbox_outside_weights;
   framework::DDim bbox_expand_dim({boxes_num, kBoxDim * class_nums});
   bbox_targets.mutable_data<T>(bbox_expand_dim, context.GetPlace());
   bbox_inside_weights.mutable_data<T>(bbox_expand_dim, context.GetPlace());
...
@@ -500,7 +500,7 @@ std::vector<Tensor> SampleRoisForOneImage(
       bbox_outside_weights_data[dst_idx + 3] = 1;
     }
   }
-  std::vector<Tensor> res;
+  std::vector<phi::DenseTensor> res;
   res.emplace_back(sampled_rois);
   res.emplace_back(sampled_labels);
   res.emplace_back(bbox_targets);
...
@@ -610,16 +610,16 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
         lod0.emplace_back(num_rois);
         continue;
       }
-      Tensor rpn_rois_slice =
+      phi::DenseTensor rpn_rois_slice =
          rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
-      Tensor gt_classes_slice =
+      phi::DenseTensor gt_classes_slice =
          gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]);
-      Tensor is_crowd_slice =
+      phi::DenseTensor is_crowd_slice =
          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
-      Tensor gt_boxes_slice =
+      phi::DenseTensor gt_boxes_slice =
          gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      Tensor max_overlap_slice;
+      phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1);
+      phi::DenseTensor max_overlap_slice;
       if (is_cascade_rcnn) {
         auto* max_overlap = context.Input<phi::DenseTensor>("MaxOverlap");
         max_overlap_slice =
...
@@ -628,7 +628,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
         max_overlap_slice.mutable_data<T>({rpn_rois_slice.dims()[0]}, context.GetPlace());
       }
-      std::vector<Tensor> tensor_output =
+      std::vector<phi::DenseTensor> tensor_output =
          SampleRoisForOneImage<T>(dev_ctx,
                                   rpn_rois_slice,
                                   gt_classes_slice,
...
@@ -647,12 +647,12 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
                                   is_cascade_rcnn,
                                   is_cls_agnostic,
                                   max_overlap_slice);
-      Tensor sampled_rois = tensor_output[0];
-      Tensor sampled_labels_int32 = tensor_output[1];
-      Tensor sampled_bbox_targets = tensor_output[2];
-      Tensor sampled_bbox_inside_weights = tensor_output[3];
-      Tensor sampled_bbox_outside_weights = tensor_output[4];
-      Tensor sampled_max_overlap = tensor_output[5];
+      phi::DenseTensor sampled_rois = tensor_output[0];
+      phi::DenseTensor sampled_labels_int32 = tensor_output[1];
+      phi::DenseTensor sampled_bbox_targets = tensor_output[2];
+      phi::DenseTensor sampled_bbox_inside_weights = tensor_output[3];
+      phi::DenseTensor sampled_bbox_outside_weights = tensor_output[4];
+      phi::DenseTensor sampled_max_overlap = tensor_output[5];
       AppendRois<T>(rois, kBoxDim * num_rois, &sampled_rois);
       AppendRois<int>(labels_int32, num_rois, &sampled_labels_int32);
...
paddle/fluid/operators/detection/generate_proposals_op.cc
...
@@ -27,8 +27,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 class GenerateProposalsOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
...
@@ -115,7 +113,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
                                 context.GetPlace());
     rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
-    Tensor bbox_deltas_swap, scores_swap;
+    phi::DenseTensor bbox_deltas_swap, scores_swap;
     bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox}, dev_ctx.GetPlace());
     scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
...
@@ -136,14 +134,14 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     int64_t num_proposals = 0;
     for (int64_t i = 0; i < num; ++i) {
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
-      Tensor scores_slice = scores_swap.Slice(i, i + 1);
+      phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1);
+      phi::DenseTensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
+      phi::DenseTensor scores_slice = scores_swap.Slice(i, i + 1);
       bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
       scores_slice.Resize({h_score * w_score * c_score, 1});
-      std::pair<Tensor, Tensor> tensor_pair =
+      std::pair<phi::DenseTensor, phi::DenseTensor> tensor_pair =
          ProposalForOneImage(dev_ctx,
                              im_info_slice,
                              anchors,
...
@@ -155,8 +153,8 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
                              nms_thresh,
                              min_size,
                              eta);
-      Tensor& proposals = tensor_pair.first;
-      Tensor& scores = tensor_pair.second;
+      phi::DenseTensor& proposals = tensor_pair.first;
+      phi::DenseTensor& scores = tensor_pair.second;
       AppendProposals(rpn_rois, 4 * num_proposals, proposals);
       AppendProposals(rpn_roi_probs, num_proposals, scores);
...
@@ -179,13 +177,13 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     rpn_roi_probs->Resize({num_proposals, 1});
   }
-  std::pair<Tensor, Tensor> ProposalForOneImage(
+  std::pair<phi::DenseTensor, phi::DenseTensor> ProposalForOneImage(
       const phi::CPUContext& ctx,
-      const Tensor& im_info_slice,
-      const Tensor& anchors,
-      const Tensor& variances,
-      const Tensor& bbox_deltas_slice,  // [M, 4]
-      const Tensor& scores_slice,       // [N, 1]
+      const phi::DenseTensor& im_info_slice,
+      const phi::DenseTensor& anchors,
+      const phi::DenseTensor& variances,
+      const phi::DenseTensor& bbox_deltas_slice,  // [M, 4]
+      const phi::DenseTensor& scores_slice,       // [N, 1]
      int pre_nms_top_n,
      int post_nms_top_n,
      float nms_thresh,
...
@@ -194,7 +192,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     auto* scores_data = scores_slice.data<T>();
     // Sort index
-    Tensor index_t;
+    phi::DenseTensor index_t;
     index_t.Resize({scores_slice.numel()});
     int* index = index_t.mutable_data<int>(ctx.GetPlace());
     for (int i = 0; i < scores_slice.numel(); ++i) {
...
@@ -212,7 +210,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
       index_t.Resize({pre_nms_top_n});
     }
-    Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
+    phi::DenseTensor scores_sel, bbox_sel, anchor_sel, var_sel;
     scores_sel.mutable_data<T>({index_t.numel(), 1}, ctx.GetPlace());
     bbox_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
     anchor_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
...
@@ -223,26 +221,26 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     phi::funcs::CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
     phi::funcs::CPUGather<T>(ctx, variances, index_t, &var_sel);
-    Tensor proposals;
+    phi::DenseTensor proposals;
     proposals.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
     BoxCoder<T>(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals);
     ClipTiledBoxes<T>(ctx, im_info_slice, proposals, &proposals, false);
-    Tensor keep;
+    phi::DenseTensor keep;
     FilterBoxes<T>(ctx, &proposals, min_size, im_info_slice, true, &keep);
     // Handle the case when there is no keep index left
     if (keep.numel() == 0) {
       phi::funcs::SetConstant<phi::CPUContext, T> set_zero;
       bbox_sel.mutable_data<T>({1, 4}, ctx.GetPlace());
       set_zero(ctx, &bbox_sel, static_cast<T>(0));
-      Tensor scores_filter;
+      phi::DenseTensor scores_filter;
       scores_filter.mutable_data<T>({1, 1}, ctx.GetPlace());
       set_zero(ctx, &scores_filter, static_cast<T>(0));
       return std::make_pair(bbox_sel, scores_filter);
     }
-    Tensor scores_filter;
+    phi::DenseTensor scores_filter;
     bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
     scores_filter.mutable_data<T>({keep.numel(), 1}, ctx.GetPlace());
     phi::funcs::CPUGather<T>(ctx, proposals, keep, &bbox_sel);
...
@@ -251,7 +249,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
       return std::make_pair(bbox_sel, scores_filter);
     }
-    Tensor keep_nms =
+    phi::DenseTensor keep_nms =
        phi::funcs::NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
     if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
...
paddle/fluid/operators/detection/generate_proposals_op.cu
...
@@ -28,24 +28,22 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 namespace {
 template <typename T>
-static std::pair<Tensor, Tensor> ProposalForOneImage(
+static std::pair<phi::DenseTensor, phi::DenseTensor> ProposalForOneImage(
     const phi::GPUContext& ctx,
-    const Tensor& im_info,
-    const Tensor& anchors,
-    const Tensor& variances,
-    const Tensor& bbox_deltas,  // [M, 4]
-    const Tensor& scores,       // [N, 1]
+    const phi::DenseTensor& im_info,
+    const phi::DenseTensor& anchors,
+    const phi::DenseTensor& variances,
+    const phi::DenseTensor& bbox_deltas,  // [M, 4]
+    const phi::DenseTensor& scores,       // [N, 1]
     int pre_nms_top_n,
     int post_nms_top_n,
     float nms_thresh,
     float min_size,
     float eta) {
   // 1. pre nms
-  Tensor scores_sort, index_sort;
+  phi::DenseTensor scores_sort, index_sort;
   SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
   int num = scores.numel();
   int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
...
@@ -54,7 +52,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   index_sort.Resize({pre_nms_num, 1});
   // 2. box decode and clipping
-  Tensor proposals;
+  phi::DenseTensor proposals;
   proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
   {
...
@@ -68,7 +66,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   }
   // 3. filter
-  Tensor keep_index, keep_num_t;
+  phi::DenseTensor keep_index, keep_num_t;
   keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
   keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
   min_size = std::max(min_size, 1.0f);
...
@@ -90,7 +88,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   ctx.Wait();
   keep_index.Resize({keep_num});
-  Tensor scores_filter, proposals_filter;
+  phi::DenseTensor scores_filter, proposals_filter;
   // Handle the case when there is no keep index left
   if (keep_num == 0) {
     phi::funcs::SetConstant<phi::GPUContext, T> set_zero;
...
@@ -110,13 +108,13 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   }
   // 4. nms
-  Tensor keep_nms;
+  phi::DenseTensor keep_nms;
   NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
   if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
     keep_nms.Resize({post_nms_top_n});
   }
-  Tensor scores_nms, proposals_nms;
+  phi::DenseTensor scores_nms, proposals_nms;
   proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
   scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
   phi::funcs::GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
...
@@ -171,7 +169,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
     int64_t h_bbox = bbox_dim[2];
     int64_t w_bbox = bbox_dim[3];
-    Tensor bbox_deltas_swap, scores_swap;
+    phi::DenseTensor bbox_deltas_swap, scores_swap;
     bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox}, dev_ctx.GetPlace());
     scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
...
@@ -200,14 +198,14 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
     std::vector<int> tmp_num;
     for (int64_t i = 0; i < num; ++i) {
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
-      Tensor scores_slice = scores_swap.Slice(i, i + 1);
+      phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1);
+      phi::DenseTensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
+      phi::DenseTensor scores_slice = scores_swap.Slice(i, i + 1);
       bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
       scores_slice.Resize({h_score * w_score * c_score, 1});
-      std::pair<Tensor, Tensor> box_score_pair =
+      std::pair<phi::DenseTensor, phi::DenseTensor> box_score_pair =
          ProposalForOneImage<T>(dev_ctx,
                                 im_info_slice,
                                 anchors,
...
@@ -220,8 +218,8 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
                                 min_size,
                                 eta);
-      Tensor& proposals = box_score_pair.first;
-      Tensor& scores = box_score_pair.second;
+      phi::DenseTensor& proposals = box_score_pair.first;
+      phi::DenseTensor& scores = box_score_pair.second;
       memory::Copy(place,
                    rpn_rois_data + num_proposals * 4,
...
paddle/fluid/operators/detection/generate_proposals_v2_op.cc
...
@@ -29,8 +29,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 class GenerateProposalsV2Op : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
...
paddle/fluid/operators/detection/iou_similarity_op_mlu.cc
浏览文件 @
65420271
...
...
@@ -18,8 +18,6 @@ limitations under the License. */
namespace
paddle
{
namespace
operators
{
using
Tensor
=
phi
::
DenseTensor
;
template
<
typename
T
>
struct
IouFunction {
 public:
...
@@ -182,21 +180,21 @@ class IouSimilarityMLUKernel : public framework::OpKernel<T> {
     auto M = y->dims()[0];

     out->mutable_data<T>({N, M}, place);
-    Tensor xt(_type);
-    Tensor yt(_type);
+    phi::DenseTensor xt(_type);
+    phi::DenseTensor yt(_type);
     xt.mutable_data<T>({4, N}, place);
     yt.mutable_data<T>({4, M}, place);
     std::vector<int> vec_trans = {1, 0};
     F.Transpose(x, &xt, vec_trans);
     F.Transpose(y, &yt, vec_trans);
-    Tensor xmin1 = xt.Slice(0, 1);
-    Tensor ymin1 = xt.Slice(1, 2);
-    Tensor xmax1 = xt.Slice(2, 3);
-    Tensor ymax1 = xt.Slice(3, 4);
-    Tensor xmin2 = yt.Slice(0, 1);
-    Tensor ymin2 = yt.Slice(1, 2);
-    Tensor xmax2 = yt.Slice(2, 3);
-    Tensor ymax2 = yt.Slice(3, 4);
+    phi::DenseTensor xmin1 = xt.Slice(0, 1);
+    phi::DenseTensor ymin1 = xt.Slice(1, 2);
+    phi::DenseTensor xmax1 = xt.Slice(2, 3);
+    phi::DenseTensor ymax1 = xt.Slice(3, 4);
+    phi::DenseTensor xmin2 = yt.Slice(0, 1);
+    phi::DenseTensor ymin2 = yt.Slice(1, 2);
+    phi::DenseTensor xmax2 = yt.Slice(2, 3);
+    phi::DenseTensor ymax2 = yt.Slice(3, 4);
     xmin1.Resize({N, 1});
     ymin1.Resize({N, 1});
     xmax1.Resize({N, 1});
...
@@ -206,12 +204,12 @@ class IouSimilarityMLUKernel : public framework::OpKernel<T> {
     xmax2.Resize({1, M});
     ymax2.Resize({1, M});
-    Tensor w1(_type);
-    Tensor h1(_type);
-    Tensor w2(_type);
-    Tensor h2(_type);
-    Tensor area1(_type);
-    Tensor area2(_type);
+    phi::DenseTensor w1(_type);
+    phi::DenseTensor h1(_type);
+    phi::DenseTensor w2(_type);
+    phi::DenseTensor h2(_type);
+    phi::DenseTensor area1(_type);
+    phi::DenseTensor area2(_type);
     w1.mutable_data<T>({N, 1}, place);
     h1.mutable_data<T>({N, 1}, place);
     w2.mutable_data<T>({1, M}, place);
...
@@ -231,10 +229,10 @@ class IouSimilarityMLUKernel : public framework::OpKernel<T> {
     F.Mul(&w1, &h1, &area1);
     F.Mul(&w2, &h2, &area2);
-    Tensor inter_xmax(_type);
-    Tensor inter_ymax(_type);
-    Tensor inter_xmin(_type);
-    Tensor inter_ymin(_type);
+    phi::DenseTensor inter_xmax(_type);
+    phi::DenseTensor inter_ymax(_type);
+    phi::DenseTensor inter_xmin(_type);
+    phi::DenseTensor inter_ymin(_type);
     inter_xmax.mutable_data<T>({N, M}, place);
     inter_ymax.mutable_data<T>({N, M}, place);
     inter_xmin.mutable_data<T>({N, M}, place);
...
@@ -244,8 +242,8 @@ class IouSimilarityMLUKernel : public framework::OpKernel<T> {
     F.Maximum(&xmin1, &xmin2, &inter_xmin);
     F.Maximum(&ymin1, &ymin2, &inter_ymin);
-    Tensor inter_w(_type);
-    Tensor inter_h(_type);
+    phi::DenseTensor inter_w(_type);
+    phi::DenseTensor inter_h(_type);
     inter_w.mutable_data<T>({N, M}, place);
     inter_h.mutable_data<T>({N, M}, place);
     F.Sub(&inter_xmax, &inter_xmin, &inter_w);
...
@@ -255,14 +253,14 @@ class IouSimilarityMLUKernel : public framework::OpKernel<T> {
       F.Adds(&inter_w, 1.0f, &inter_w);
       F.Adds(&inter_h, 1.0f, &inter_h);
     }
-    Tensor zeros(_type);
+    phi::DenseTensor zeros(_type);
     zeros.mutable_data<T>({1}, place);
     FillMLUTensorWithHostValue<T>(ctx, static_cast<T>(0), &zeros);
     F.Maximum(&inter_w, &zeros, &inter_w);
     F.Maximum(&inter_h, &zeros, &inter_h);
     F.Mul(&inter_w, &inter_h, out);
-    Tensor union_area(_type);
+    phi::DenseTensor union_area(_type);
     union_area.mutable_data<T>({N, M}, place);
     F.Add(&area1, &area2, &union_area);
     F.Sub(&union_area, out, &union_area);
...
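For reference, the quantity these hunks assemble (and which the NPU kernel below repeats) is the usual pairwise IoU between the N boxes in x and the M boxes in y. In the kernel's variable names, a minimal sketch of the math (the +1 offset from F.Adds applies only when coordinates are not normalized):

w^\cap_{ij} = \max\bigl(0,\ \min(x^{\max}_i, x^{\max}_j) - \max(x^{\min}_i, x^{\min}_j)\bigr),\qquad
h^\cap_{ij} = \max\bigl(0,\ \min(y^{\max}_i, y^{\max}_j) - \max(y^{\min}_i, y^{\min}_j)\bigr)

\mathrm{out}_{ij} = \frac{w^\cap_{ij}\, h^\cap_{ij}}{\mathrm{area1}_i + \mathrm{area2}_j - w^\cap_{ij}\, h^\cap_{ij}}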
paddle/fluid/operators/detection/iou_similarity_op_npu.cc
...
@@ -18,8 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename T>
 struct IouFunction {
  public:
...
@@ -108,21 +106,21 @@ class IouSimilarityNPUKernel : public framework::OpKernel<T> {
     auto M = y->dims()[0];

     out->mutable_data<T>({N, M}, place);
-    Tensor xt(_type);
-    Tensor yt(_type);
+    phi::DenseTensor xt(_type);
+    phi::DenseTensor yt(_type);
     xt.mutable_data<T>({4, N}, place);
     yt.mutable_data<T>({4, M}, place);
     std::vector<int> vec_trans = {1, 0};
     F.Transpose(x, &xt, vec_trans);
     F.Transpose(y, &yt, vec_trans);
-    Tensor xmin1 = xt.Slice(0, 1);
-    Tensor ymin1 = xt.Slice(1, 2);
-    Tensor xmax1 = xt.Slice(2, 3);
-    Tensor ymax1 = xt.Slice(3, 4);
-    Tensor xmin2 = yt.Slice(0, 1);
-    Tensor ymin2 = yt.Slice(1, 2);
-    Tensor xmax2 = yt.Slice(2, 3);
-    Tensor ymax2 = yt.Slice(3, 4);
+    phi::DenseTensor xmin1 = xt.Slice(0, 1);
+    phi::DenseTensor ymin1 = xt.Slice(1, 2);
+    phi::DenseTensor xmax1 = xt.Slice(2, 3);
+    phi::DenseTensor ymax1 = xt.Slice(3, 4);
+    phi::DenseTensor xmin2 = yt.Slice(0, 1);
+    phi::DenseTensor ymin2 = yt.Slice(1, 2);
+    phi::DenseTensor xmax2 = yt.Slice(2, 3);
+    phi::DenseTensor ymax2 = yt.Slice(3, 4);
     xmin1.Resize({N, 1});
     ymin1.Resize({N, 1});
     xmax1.Resize({N, 1});
...
@@ -132,12 +130,12 @@ class IouSimilarityNPUKernel : public framework::OpKernel<T> {
     xmax2.Resize({1, M});
     ymax2.Resize({1, M});
-    Tensor w1(_type);
-    Tensor h1(_type);
-    Tensor w2(_type);
-    Tensor h2(_type);
-    Tensor area1(_type);
-    Tensor area2(_type);
+    phi::DenseTensor w1(_type);
+    phi::DenseTensor h1(_type);
+    phi::DenseTensor w2(_type);
+    phi::DenseTensor h2(_type);
+    phi::DenseTensor area1(_type);
+    phi::DenseTensor area2(_type);
     w1.mutable_data<T>({N, 1}, place);
     h1.mutable_data<T>({N, 1}, place);
     w2.mutable_data<T>({1, M}, place);
...
@@ -157,10 +155,10 @@ class IouSimilarityNPUKernel : public framework::OpKernel<T> {
     F.Mul(&w1, &h1, &area1);
     F.Mul(&w2, &h2, &area2);
-    Tensor inter_xmax(_type);
-    Tensor inter_ymax(_type);
-    Tensor inter_xmin(_type);
-    Tensor inter_ymin(_type);
+    phi::DenseTensor inter_xmax(_type);
+    phi::DenseTensor inter_ymax(_type);
+    phi::DenseTensor inter_xmin(_type);
+    phi::DenseTensor inter_ymin(_type);
     inter_xmax.mutable_data<T>({N, M}, place);
     inter_ymax.mutable_data<T>({N, M}, place);
     inter_xmin.mutable_data<T>({N, M}, place);
...
@@ -170,8 +168,8 @@ class IouSimilarityNPUKernel : public framework::OpKernel<T> {
     F.Maximum(&xmin1, &xmin2, &inter_xmin);
     F.Maximum(&ymin1, &ymin2, &inter_ymin);
-    Tensor inter_w(_type);
-    Tensor inter_h(_type);
+    phi::DenseTensor inter_w(_type);
+    phi::DenseTensor inter_h(_type);
     inter_w.mutable_data<T>({N, M}, place);
     inter_h.mutable_data<T>({N, M}, place);
     F.Sub(&inter_xmax, &inter_xmin, &inter_w);
...
@@ -181,14 +179,14 @@ class IouSimilarityNPUKernel : public framework::OpKernel<T> {
       F.Adds(&inter_w, 1.0f, &inter_w);
       F.Adds(&inter_h, 1.0f, &inter_h);
     }
-    Tensor zeros(_type);
+    phi::DenseTensor zeros(_type);
     zeros.mutable_data<T>({1}, place);
     FillNpuTensorWithConstant<T>(&zeros, static_cast<T>(0));
     F.Maximum(&inter_w, &zeros, &inter_w);
     F.Maximum(&inter_h, &zeros, &inter_h);
     F.Mul(&inter_w, &inter_h, out);
-    Tensor union_area(_type);
+    phi::DenseTensor union_area(_type);
     union_area.mutable_data<T>({N, M}, place);
     F.Add(&area1, &area2, &union_area);
     F.Sub(&union_area, out, &union_area);
...
paddle/fluid/operators/detection/locality_aware_nms_op.cc
...
@@ -19,8 +19,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 class LocalityAwareNMSOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
...
@@ -252,7 +250,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel<T> {
     int num_det = 0;

     int64_t class_num = scores->dims()[0];
-    Tensor bbox_slice, score_slice;
+    phi::DenseTensor bbox_slice, score_slice;
     for (int64_t c = 0; c < class_num; ++c) {
       if (c == background_label) continue;
...
@@ -325,7 +323,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel<T> {
     auto* bboxes_data = bboxes.data<T>();
     auto* odata = outs->data<T>();
     const T* sdata;
-    Tensor bbox;
+    phi::DenseTensor bbox;
     bbox.Resize({scores.dims()[0], box_size});
     int count = 0;
     for (const auto& it : selected_indices) {
...
@@ -370,7 +368,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel<T> {
     int64_t box_dim = boxes.dims()[2];
     int64_t out_dim = box_dim + 2;
     int num_nmsed_out = 0;
-    Tensor boxes_slice, scores_slice;
+    phi::DenseTensor boxes_slice, scores_slice;
     int n = batch_size;
     for (int i = 0; i < n; ++i) {
       scores_slice = scores.Slice(i, i + 1);
...
@@ -407,7 +405,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel<T> {
       int64_t s = batch_starts[i];
       int64_t e = batch_starts[i + 1];
       if (e > s) {
-        Tensor out = outs->Slice(s, e);
+        phi::DenseTensor out = outs->Slice(s, e);
         LocalityAwareNMSOutput(dev_ctx,
                                scores_slice,
                                boxes_slice,
...
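A recurring idiom in these NMS kernels is taking a per-class or per-batch view of a larger tensor with Slice and then reshaping it. A minimal sketch of that pattern follows, assuming (as these kernels do) that phi::DenseTensor::Slice returns a non-owning view over rows of the same buffer; the helper name and header path are illustrative, not part of the patch.

#include "paddle/phi/core/dense_tensor.h"

// Illustrative helper: take rows [s, e) of `outs` as a view and reinterpret
// its shape. Slice does not copy, so writes through the returned tensor land
// directly in `outs`.
static phi::DenseTensor RowsView(phi::DenseTensor* outs,
                                 int64_t s, int64_t e, int64_t out_dim) {
  phi::DenseTensor out = outs->Slice(s, e);  // shares the underlying buffer
  out.Resize({e - s, out_dim});              // numel stays the same
  return out;
}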
paddle/fluid/operators/detection/matrix_nms_op.cc
...
@@ -20,8 +20,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 class MatrixNMSOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
...
paddle/fluid/operators/detection/multiclass_nms_op.cc
...
@@ -21,8 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 inline std::vector<size_t> GetNmsLodFromRoisNum(
     const phi::DenseTensor* rois_num) {
   std::vector<size_t> rois_lod;
...
@@ -228,7 +226,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     int num_det = 0;

     int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1];
-    Tensor bbox_slice, score_slice;
+    phi::DenseTensor bbox_slice, score_slice;
     for (int64_t c = 0; c < class_num; ++c) {
       if (c == background_label) continue;
       if (scores_size == 3) {
...
@@ -319,7 +317,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     auto* bboxes_data = bboxes.data<T>();
     auto* odata = outs->data<T>();
     const T* sdata;
-    Tensor bbox;
+    phi::DenseTensor bbox;
     bbox.Resize({scores.dims()[0], box_size});
     int count = 0;
     for (const auto& it : selected_indices) {
...
@@ -373,7 +371,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     int64_t box_dim = boxes->dims()[2];
     int64_t out_dim = box_dim + 2;
     int num_nmsed_out = 0;
-    Tensor boxes_slice, scores_slice;
+    phi::DenseTensor boxes_slice, scores_slice;
     int n = 0;
     if (has_roisnum) {
       n = score_size == 3 ? batch_size : rois_num->numel();
...
@@ -449,7 +447,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
       int64_t s = batch_starts[i];
       int64_t e = batch_starts[i + 1];
       if (e > s) {
-        Tensor out = outs->Slice(s, e);
+        phi::DenseTensor out = outs->Slice(s, e);
         if (return_index) {
           int* output_idx =
               index->mutable_data<int>({num_kept, 1}, ctx.GetPlace());
...
paddle/fluid/operators/detection/polygon_box_transform_op.cc
...
@@ -17,8 +17,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
  public:
...
paddle/fluid/operators/detection/polygon_box_transform_op.cu
...
@@ -19,7 +19,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
 using phi::PADDLE_CUDA_NUM_THREADS;

 #define CUDA_BLOCK_SIZE 16
...
paddle/fluid/operators/detection/prior_box_op_npu.cc
...
@@ -18,8 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class PriorBoxNPUKernel : public framework::OpKernel<T> {
  public:
...
@@ -50,7 +48,7 @@ class PriorBoxNPUKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();

-    Tensor out(input->type());
+    phi::DenseTensor out(input->type());
     auto out_dims = phi::vectorize(boxes->dims());
     out_dims.insert(out_dims.begin(), 2);
     out.Resize(phi::make_ddim(out_dims));
...
@@ -75,8 +73,8 @@ class PriorBoxNPUKernel : public framework::OpKernel<T> {
     runner.Run(stream);

     out.Resize(phi::make_ddim({out.numel()}));
-    Tensor out_boxes = out.Slice(0, boxes->numel());
-    Tensor out_variances = out.Slice(boxes->numel(), out.numel());
+    phi::DenseTensor out_boxes = out.Slice(0, boxes->numel());
+    phi::DenseTensor out_variances = out.Slice(boxes->numel(), out.numel());
     out_boxes.Resize(boxes->dims());
     out_variances.Resize(variances->dims());
...
paddle/fluid/operators/detection/retinanet_detection_output_op.cc
...
@@ -18,8 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 class RetinanetDetectionOutputOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
...
@@ -409,9 +407,9 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel<T> {
   }

   void RetinanetDetectionOutput(const framework::ExecutionContext& ctx,
-                                const std::vector<Tensor>& scores,
-                                const std::vector<Tensor>& bboxes,
-                                const std::vector<Tensor>& anchors,
+                                const std::vector<phi::DenseTensor>& scores,
+                                const std::vector<phi::DenseTensor>& bboxes,
+                                const std::vector<phi::DenseTensor>& anchors,
                                 const phi::DenseTensor& im_info,
                                 std::vector<std::vector<T>>* nmsed_out,
                                 int* num_nmsed_out) const {
...
@@ -425,11 +423,11 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel<T> {
     std::map<int, std::vector<std::vector<T>>> preds;
     for (size_t l = 0; l < scores.size(); ++l) {
       // Fetch per level score
-      Tensor scores_per_level = scores[l];
+      phi::DenseTensor scores_per_level = scores[l];
       // Fetch per level bbox
-      Tensor bboxes_per_level = bboxes[l];
+      phi::DenseTensor bboxes_per_level = bboxes[l];
       // Fetch per level anchor
-      Tensor anchors_per_level = anchors[l];
+      phi::DenseTensor anchors_per_level = anchors[l];
       int64_t scores_num = scores_per_level.numel();
       int64_t bboxes_num = bboxes_per_level.numel();
...
@@ -492,9 +490,9 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel<T> {
     auto* im_info = ctx.Input<phi::DenseTensor>("ImInfo");
     auto* outs = ctx.Output<phi::DenseTensor>("Out");

-    std::vector<Tensor> boxes_list(boxes.size());
-    std::vector<Tensor> scores_list(scores.size());
-    std::vector<Tensor> anchors_list(anchors.size());
+    std::vector<phi::DenseTensor> boxes_list(boxes.size());
+    std::vector<phi::DenseTensor> scores_list(scores.size());
+    std::vector<phi::DenseTensor> anchors_list(anchors.size());
     for (size_t j = 0; j < boxes_list.size(); ++j) {
       boxes_list[j] = *boxes[j];
       scores_list[j] = *scores[j];
...
@@ -512,8 +510,8 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel<T> {
     std::vector<size_t> batch_starts = {0};
     for (int i = 0; i < batch_size; ++i) {
       int num_nmsed_out = 0;
-      std::vector<Tensor> box_per_batch_list(boxes_list.size());
-      std::vector<Tensor> score_per_batch_list(scores_list.size());
+      std::vector<phi::DenseTensor> box_per_batch_list(boxes_list.size());
+      std::vector<phi::DenseTensor> score_per_batch_list(scores_list.size());
       for (size_t j = 0; j < boxes_list.size(); ++j) {
         const auto& score_dims = scores_list[j].dims();
         score_per_batch_list[j] = scores_list[j].Slice(i, i + 1);
...
@@ -521,7 +519,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel<T> {
         box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1);
         box_per_batch_list[j].Resize({score_dims[1], box_dim});
       }
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
+      phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1);
       std::vector<std::vector<T>> nmsed_out;
       RetinanetDetectionOutput(ctx,
...
@@ -544,7 +542,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel<T> {
       int64_t s = batch_starts[i];
       int64_t e = batch_starts[i + 1];
       if (e > s) {
-        Tensor out = outs->Slice(s, e);
+        phi::DenseTensor out = outs->Slice(s, e);
         MultiClassOutput(dev_ctx, all_nmsed_out[i], &out);
       }
     }
...
@@ -563,7 +561,8 @@ class RetinanetDetectionOutputOpMaker
   void Make() override {
     AddInput("BBoxes",
              "(List) A list of tensors from multiple FPN levels. Each "
-             "element is a 3-D Tensor with shape [N, Mi, 4] represents the "
+             "element is a 3-D phi::DenseTensor with shape [N, Mi, 4] "
+             "represents the "
              "predicted locations of Mi bounding boxes, N is the batch size. "
              "Mi is the number of bounding boxes from i-th FPN level. Each "
             "bounding box has four coordinate values and the layout is "
...
@@ -571,15 +570,17 @@ class RetinanetDetectionOutputOpMaker
         .AsDuplicable();
     AddInput("Scores",
              "(List) A list of tensors from multiple FPN levels. Each "
-             "element is a 3-D Tensor with shape [N, Mi, C] represents the "
+             "element is a 3-D phi::DenseTensor with shape [N, Mi, C] "
+             "represents the "
              "predicted confidence from its FPN level. N is the batch size, "
              "C is the class number (excluding background), Mi is the number "
              "of bounding boxes from i-th FPN level. For each bounding box, "
              "there are total C scores.")
         .AsDuplicable();
-    AddInput("Anchors",
+    AddInput("Anchors",
              "(List) A list of tensors from multiple FPN levels. Each"
-             "element is a 2-D Tensor with shape [Mi, 4] represents the "
+             "element is a 2-D phi::DenseTensor with shape [Mi, 4] represents the "
              "locations of Mi anchor boxes from i-th FPN level. Each "
              "bounding box has four coordinate values and the layout is "
              "[xmin, ymin, xmax, ymax].")
...
paddle/fluid/operators/detection/roi_perspective_transform_op.cc
...
@@ -22,8 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename T>
 bool GT_E(T a, T b) {
   return (a > b) || fabs(a - b) < 1e-4;
...
@@ -600,7 +598,7 @@ class ROIPerspectiveTransformOpMaker
  public:
   void Make() override {
     AddInput("X",
-             "(Tensor), "
+             "(phi::DenseTensor), "
              "the input of ROIPerspectiveTransformOp. "
              "The format of input tensor is NCHW. Where N is batch size, "
              "C is the number of input channels, "
...
@@ -617,28 +615,28 @@ class ROIPerspectiveTransformOpMaker
              "(x4, y4) is the bottom left coordinates.");
     AddOutput("Out",
-              "(Tensor), "
+              "(phi::DenseTensor), "
               "The output of ROIPerspectiveTransformOp is a 4-D tensor with shape "
               "(num_rois, channels, transformed_h, transformed_w).");
     AddOutput("Mask",
-              "(Tensor), "
+              "(phi::DenseTensor), "
               "The output mask of ROIPerspectiveTransformOp is a 4-D tensor "
               "with shape "
               "(num_rois, 1, transformed_h, transformed_w).");
     AddOutput("TransformMatrix",
-              "(Tensor), "
+              "(phi::DenseTensor), "
               "The output transform matrix of ROIPerspectiveTransformOp is a "
               "1-D tensor with shape "
               "(num_rois, 9).");
     AddOutput("Out2InIdx",
-              "(Tensor), "
+              "(phi::DenseTensor), "
               "An intermediate tensor used to map indexes of input feature map "
               "and indexes of output feature map."
               "The shape of the tensor is [out_size, 4] and out_size is the "
               "number of elements in output feature map.")
         .AsIntermediate();
     AddOutput("Out2InWeights",
-              "(Tensor), "
+              "(phi::DenseTensor), "
               "An intermediate tensor used to record the weights of bilinear "
               "interpolatein for each element in output. The shape of the "
               "tensor is [out_size, 4] and out_size is the number of elements "
...
paddle/fluid/operators/detection/rpn_target_assign_op.cc
...
@@ -21,7 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
 template <typename T,
           int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
...
@@ -113,7 +112,8 @@ void AppendRpns(phi::DenseTensor* out,
 }

 template <typename T>
-std::vector<Tensor> FilterStraddleAnchor(const phi::CPUContext& context,
+std::vector<phi::DenseTensor> FilterStraddleAnchor(
+    const phi::CPUContext& context,
     const phi::DenseTensor* anchor,
     const float rpn_straddle_thresh,
     T im_height,
...
@@ -138,23 +138,23 @@ std::vector<Tensor> FilterStraddleAnchor(const phi::CPUContext& context,
     }
   }
   int inside_num = inds_inside.size();
-  Tensor inds_inside_t;
+  phi::DenseTensor inds_inside_t;
   int* inds_inside_data =
       inds_inside_t.mutable_data<int>({inside_num}, context.GetPlace());
   std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data);
-  Tensor inside_anchor_t;
+  phi::DenseTensor inside_anchor_t;
   T* inside_anchor_data =
       inside_anchor_t.mutable_data<T>({inside_num, 4}, context.GetPlace());
   Gather<T>(
       anchor->data<T>(), 4, inds_inside_data, inside_num, inside_anchor_data);
-  std::vector<Tensor> res;
+  std::vector<phi::DenseTensor> res;
   res.emplace_back(inds_inside_t);
   res.emplace_back(inside_anchor_t);
   return res;
 }

 template <typename T>
-Tensor FilterCrowdGt(const phi::CPUContext& context,
+phi::DenseTensor FilterCrowdGt(const phi::CPUContext& context,
                      phi::DenseTensor* gt_boxes,
                      phi::DenseTensor* is_crowd) {
   int gt_num = gt_boxes->dims()[0];
...
@@ -166,7 +166,7 @@ Tensor FilterCrowdGt(const phi::CPUContext& context,
     }
   }
   int ncrowd_num = not_crowd_inds.size();
-  Tensor ncrowd_gt_boxes;
+  phi::DenseTensor ncrowd_gt_boxes;
   T* ncrowd_gt_boxes_data =
       ncrowd_gt_boxes.mutable_data<T>({ncrowd_num, 4}, context.GetPlace());
   Gather<T>(gt_boxes->data<T>(),
...
@@ -300,7 +300,7 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data,
 }

 template <typename T>
-std::vector<Tensor> SampleRpnFgBgGt(
+std::vector<phi::DenseTensor> SampleRpnFgBgGt(
     const phi::CPUContext& ctx,
     const phi::DenseTensor& anchor_by_gt_overlap,
     const int rpn_batch_size_per_im,
...
@@ -322,7 +322,7 @@ std::vector<Tensor> SampleRpnFgBgGt(
   // Calculate the max IoU between anchors and gt boxes
   // Map from anchor to gt box that has highest overlap
   auto place = ctx.GetPlace();
-  Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max;
+  phi::DenseTensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max;
   anchor_to_gt_max.mutable_data<T>({anchor_num}, place);
   int* argmax = anchor_to_gt_argmax.mutable_data<int>({anchor_num}, place);
   gt_to_anchor_max.mutable_data<T>({gt_num}, place);
...
@@ -365,7 +365,8 @@ std::vector<Tensor> SampleRpnFgBgGt(
   for (int i = 0; i < fg_fake_num; ++i) {
     gt_inds.emplace_back(argmax[fg_fake[i]]);
   }
-  Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t;
+  phi::DenseTensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t,
+      bbox_inside_weight_t;
   int* loc_index_data = loc_index_t.mutable_data<int>({fg_fake_num}, place);
   int* score_index_data =
       score_index_t.mutable_data<int>({fg_num + bg_num}, place);
...
@@ -381,7 +382,7 @@ std::vector<Tensor> SampleRpnFgBgGt(
   std::copy(bbox_inside_weight.begin(),
             bbox_inside_weight.end(),
             bbox_inside_weight_data);
-  std::vector<Tensor> loc_score_tgtlbl_gt;
+  std::vector<phi::DenseTensor> loc_score_tgtlbl_gt;
   loc_score_tgtlbl_gt.emplace_back(loc_index_t);
   loc_score_tgtlbl_gt.emplace_back(score_index_t);
   loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t);
...
@@ -455,30 +456,30 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
     auto gt_boxes_lod = gt_boxes->lod().back();
     auto is_crowd_lod = is_crowd->lod().back();
     for (int i = 0; i < batch_num; ++i) {
-      Tensor gt_boxes_slice =
+      phi::DenseTensor gt_boxes_slice =
           gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
-      Tensor is_crowd_slice =
+      phi::DenseTensor is_crowd_slice =
           is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
+      phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1);
       auto* im_info_data = im_info_slice.data<T>();
       auto im_height = im_info_data[0];
       auto im_width = im_info_data[1];
       auto im_scale = im_info_data[2];

       // Filter straddle anchor
-      std::vector<Tensor> filter_output = FilterStraddleAnchor<T>(
+      std::vector<phi::DenseTensor> filter_output = FilterStraddleAnchor<T>(
           dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width);
-      Tensor inds_inside = filter_output[0];
-      Tensor inside_anchor = filter_output[1];
+      phi::DenseTensor inds_inside = filter_output[0];
+      phi::DenseTensor inside_anchor = filter_output[1];

       // Filter crowd gt
-      Tensor ncrowd_gt_boxes =
+      phi::DenseTensor ncrowd_gt_boxes =
           FilterCrowdGt<T>(dev_ctx, &gt_boxes_slice, &is_crowd_slice);
       auto ncrowd_gt_boxes_et =
           framework::EigenTensor<T, 2>::From(ncrowd_gt_boxes);
       ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale;

-      Tensor anchor_by_gt_overlap;
+      phi::DenseTensor anchor_by_gt_overlap;
       anchor_by_gt_overlap.mutable_data<T>(
           {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place);
       BboxOverlaps<T>(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap);
...
@@ -492,16 +493,16 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
           engine,
           use_random);

-      Tensor sampled_loc_index = loc_score_tgtlbl_gt[0];
-      Tensor sampled_score_index = loc_score_tgtlbl_gt[1];
-      Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
-      Tensor sampled_gt_index = loc_score_tgtlbl_gt[3];
-      Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4];
+      phi::DenseTensor sampled_loc_index = loc_score_tgtlbl_gt[0];
+      phi::DenseTensor sampled_score_index = loc_score_tgtlbl_gt[1];
+      phi::DenseTensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
+      phi::DenseTensor sampled_gt_index = loc_score_tgtlbl_gt[3];
+      phi::DenseTensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4];

       int loc_num = sampled_loc_index.dims()[0];
       int score_num = sampled_score_index.dims()[0];
       // unmap to all anchor
-      Tensor sampled_loc_index_unmap, sampled_score_index_unmap;
+      phi::DenseTensor sampled_loc_index_unmap, sampled_score_index_unmap;
       sampled_loc_index_unmap.mutable_data<int>({loc_num}, place);
       sampled_score_index_unmap.mutable_data<int>({score_num}, place);
       Gather<int>(inds_inside.data<int>(),
...
@@ -516,7 +517,7 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
                   sampled_score_index_unmap.data<int>());

       // get target bbox deltas
-      Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox;
+      phi::DenseTensor sampled_anchor, sampled_gt, sampled_tgt_bbox;
       auto* sampled_anchor_data =
           sampled_anchor.mutable_data<T>({loc_num, 4}, place);
       auto* sampled_gt_data = sampled_gt.mutable_data<T>({loc_num, 4}, place);
...
@@ -859,7 +860,8 @@ class RetinanetTargetAssignOp : public framework::OperatorWithKernel {
 };

 template <typename T>
-std::vector<Tensor> FilterCrowdGtBoxLabel(const phi::CPUContext& context,
+std::vector<phi::DenseTensor> FilterCrowdGtBoxLabel(
+    const phi::CPUContext& context,
     phi::DenseTensor* gt_boxes,
     phi::DenseTensor* gt_labels,
     phi::DenseTensor* is_crowd) {
...
@@ -872,7 +874,7 @@ std::vector<Tensor> FilterCrowdGtBoxLabel(const phi::CPUContext& context,
     }
   }
   int ncrowd_num = not_crowd_inds.size();
-  Tensor ncrowd_gt_boxes, ncrowd_gt_labels;
+  phi::DenseTensor ncrowd_gt_boxes, ncrowd_gt_labels;
   T* ncrowd_gt_boxes_data =
       ncrowd_gt_boxes.mutable_data<T>({ncrowd_num, 4}, context.GetPlace());
   int* ncrowd_gt_labels_data =
...
@@ -887,14 +889,15 @@ std::vector<Tensor> FilterCrowdGtBoxLabel(const phi::CPUContext& context,
             not_crowd_inds.data(),
             ncrowd_num,
             ncrowd_gt_labels_data);
-  std::vector<Tensor> res;
+  std::vector<phi::DenseTensor> res;
   res.emplace_back(ncrowd_gt_boxes);
   res.emplace_back(ncrowd_gt_labels);
   return res;
 }

 template <typename T>
-std::vector<Tensor> GetAllFgBgGt(const phi::CPUContext& ctx,
+std::vector<phi::DenseTensor> GetAllFgBgGt(
+    const phi::CPUContext& ctx,
     const phi::DenseTensor& anchor_by_gt_overlap,
     const phi::DenseTensor& ncrowd_gt_labels,
     const float positive_overlap,
...
@@ -913,7 +916,7 @@ std::vector<Tensor> GetAllFgBgGt(const phi::CPUContext& ctx,
   // Calculate the max IoU between anchors and gt boxes
   // Map from anchor to gt box that has highest overlap
   auto place = ctx.GetPlace();
-  Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max;
+  phi::DenseTensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max;
   anchor_to_gt_max.mutable_data<T>({anchor_num}, place);
   int* argmax = anchor_to_gt_argmax.mutable_data<int>({anchor_num}, place);
   gt_to_anchor_max.mutable_data<T>({gt_num}, place);
...
@@ -961,8 +964,9 @@ std::vector<Tensor> GetAllFgBgGt(const phi::CPUContext& ctx,
     gt_inds.emplace_back(argmax[fg_fake[i]]);
   }

-  Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t;
-  Tensor fg_num_t;
+  phi::DenseTensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t,
+      bbox_inside_weight_t;
+  phi::DenseTensor fg_num_t;
   int* loc_index_data = loc_index_t.mutable_data<int>({fg_fake_num}, place);
   int* score_index_data =
       score_index_t.mutable_data<int>({fg_num + bg_num}, place);
...
@@ -980,7 +984,7 @@ std::vector<Tensor> GetAllFgBgGt(const phi::CPUContext& ctx,
             bbox_inside_weight.end(),
             bbox_inside_weight_data);
   fg_num_data[0] = fg_fake.size() + 1;
-  std::vector<Tensor> loc_score_tgtlbl_gt;
+  std::vector<phi::DenseTensor> loc_score_tgtlbl_gt;
   loc_score_tgtlbl_gt.emplace_back(loc_index_t);
   loc_score_tgtlbl_gt.emplace_back(score_index_t);
   loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t);
...
@@ -1065,35 +1069,35 @@ class RetinanetTargetAssignKernel : public framework::OpKernel<T> {
     auto gt_labels_lod = gt_labels->lod().back();
     auto is_crowd_lod = is_crowd->lod().back();
     for (int i = 0; i < batch_num; ++i) {
-      Tensor gt_boxes_slice =
+      phi::DenseTensor gt_boxes_slice =
          gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
-      Tensor gt_labels_slice =
+      phi::DenseTensor gt_labels_slice =
          gt_labels->Slice(gt_labels_lod[i], gt_labels_lod[i + 1]);
-      Tensor is_crowd_slice =
+      phi::DenseTensor is_crowd_slice =
          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
+      phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1);
       auto* im_info_data = im_info_slice.data<T>();
       auto im_height = im_info_data[0];
       auto im_width = im_info_data[1];
       auto im_scale = im_info_data[2];

       // Filter straddle anchor
-      std::vector<Tensor> filter_output =
+      std::vector<phi::DenseTensor> filter_output =
          FilterStraddleAnchor<T>(dev_ctx, anchor, -1, im_height, im_width);
-      Tensor inds_inside = filter_output[0];
-      Tensor inside_anchor = filter_output[1];
+      phi::DenseTensor inds_inside = filter_output[0];
+      phi::DenseTensor inside_anchor = filter_output[1];

       // Filter crowd gt
-      std::vector<Tensor> ncrowd_output = FilterCrowdGtBoxLabel<T>(
+      std::vector<phi::DenseTensor> ncrowd_output = FilterCrowdGtBoxLabel<T>(
          dev_ctx, &gt_boxes_slice, &gt_labels_slice, &is_crowd_slice);
-      Tensor ncrowd_gt_boxes = ncrowd_output[0];
-      Tensor ncrowd_gt_labels = ncrowd_output[1];
+      phi::DenseTensor ncrowd_gt_boxes = ncrowd_output[0];
+      phi::DenseTensor ncrowd_gt_labels = ncrowd_output[1];

       auto ncrowd_gt_boxes_et =
          framework::EigenTensor<T, 2>::From(ncrowd_gt_boxes);
       ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale;

-      Tensor anchor_by_gt_overlap;
+      phi::DenseTensor anchor_by_gt_overlap;
       anchor_by_gt_overlap.mutable_data<T>(
          {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place);
       BboxOverlaps<T>(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap);
...
@@ -1105,17 +1109,17 @@ class RetinanetTargetAssignKernel : public framework::OpKernel<T> {
          negative_overlap,
          engine);

-      Tensor sampled_loc_index = loc_score_tgtlbl_gt[0];
-      Tensor sampled_score_index = loc_score_tgtlbl_gt[1];
-      Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
-      Tensor sampled_gt_index = loc_score_tgtlbl_gt[3];
-      Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4];
-      Tensor sampled_fg_num = loc_score_tgtlbl_gt[5];
+      phi::DenseTensor sampled_loc_index = loc_score_tgtlbl_gt[0];
+      phi::DenseTensor sampled_score_index = loc_score_tgtlbl_gt[1];
+      phi::DenseTensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
+      phi::DenseTensor sampled_gt_index = loc_score_tgtlbl_gt[3];
+      phi::DenseTensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4];
+      phi::DenseTensor sampled_fg_num = loc_score_tgtlbl_gt[5];

       int loc_num = sampled_loc_index.dims()[0];
       int score_num = sampled_score_index.dims()[0];
       // unmap to all anchor
-      Tensor sampled_loc_index_unmap, sampled_score_index_unmap;
+      phi::DenseTensor sampled_loc_index_unmap, sampled_score_index_unmap;
       sampled_loc_index_unmap.mutable_data<int>({loc_num}, place);
       sampled_score_index_unmap.mutable_data<int>({score_num}, place);
       Gather<int>(inds_inside.data<int>(),
...
@@ -1130,7 +1134,7 @@ class RetinanetTargetAssignKernel : public framework::OpKernel<T> {
                   sampled_score_index_unmap.data<int>());

       // get target bbox deltas
-      Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox;
+      phi::DenseTensor sampled_anchor, sampled_gt, sampled_tgt_bbox;
       auto* sampled_anchor_data =
          sampled_anchor.mutable_data<T>({loc_num, 4}, place);
       auto* sampled_gt_data = sampled_gt.mutable_data<T>({loc_num, 4}, place);
...
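The comments repeated in this file ("Calculate the max IoU between anchors and gt boxes", "Map from anchor to gt box that has highest overlap") correspond to simple row and column reductions over the anchor_by_gt_overlap matrix produced by BboxOverlaps; in the kernel's own variable names, roughly:

\text{anchor\_to\_gt\_max}_i = \max_j \mathrm{IoU}(a_i, g_j),\qquad
\text{anchor\_to\_gt\_argmax}_i = \arg\max_j \mathrm{IoU}(a_i, g_j),\qquad
\text{gt\_to\_anchor\_max}_j = \max_i \mathrm{IoU}(a_i, g_j)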
paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
...
@@ -19,8 +19,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaxinumNumBlocks = 4096;
...
@@ -123,10 +121,10 @@ template <typename DeviceContext, typename T>
 class GPUSigmoidFocalLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* X = context.Input<phi::DenseTensor>("X");
-    const Tensor* Labels = context.Input<phi::DenseTensor>("Label");
-    const Tensor* FgNum = context.Input<phi::DenseTensor>("FgNum");
-    Tensor* Out = context.Output<phi::DenseTensor>("Out");
+    const phi::DenseTensor* X = context.Input<phi::DenseTensor>("X");
+    const phi::DenseTensor* Labels = context.Input<phi::DenseTensor>("Label");
+    const phi::DenseTensor* FgNum = context.Input<phi::DenseTensor>("FgNum");
+    phi::DenseTensor* Out = context.Output<phi::DenseTensor>("Out");
     T gamma = static_cast<T>(context.Attr<float>("gamma"));
     T alpha = static_cast<T>(context.Attr<float>("alpha"));
     auto x_dims = X->dims();
...
@@ -154,12 +152,13 @@ template <typename DeviceContext, typename T>
 class GPUSigmoidFocalLossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* X = context.Input<phi::DenseTensor>("X");
-    const Tensor* Labels = context.Input<phi::DenseTensor>("Label");
-    const Tensor* FgNum = context.Input<phi::DenseTensor>("FgNum");
-    const Tensor* dOut =
+    const phi::DenseTensor* X = context.Input<phi::DenseTensor>("X");
+    const phi::DenseTensor* Labels = context.Input<phi::DenseTensor>("Label");
+    const phi::DenseTensor* FgNum = context.Input<phi::DenseTensor>("FgNum");
+    const phi::DenseTensor* dOut =
         context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-    Tensor* dX = context.Output<phi::DenseTensor>(framework::GradVarName("X"));
+    phi::DenseTensor* dX =
+        context.Output<phi::DenseTensor>(framework::GradVarName("X"));
     auto dx_data = dX->mutable_data<T>(context.GetPlace());
     T gamma = static_cast<T>(context.Attr<float>("gamma"));
     T alpha = static_cast<T>(context.Attr<float>("alpha"));
...
paddle/fluid/operators/detection/sigmoid_focal_loss_op.h
...
@@ -22,16 +22,14 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class SigmoidFocalLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* X = context.Input<phi::DenseTensor>("X");
-    const Tensor* Labels = context.Input<phi::DenseTensor>("Label");
-    const Tensor* FgNum = context.Input<phi::DenseTensor>("FgNum");
-    Tensor* Out = context.Output<phi::DenseTensor>("Out");
+    const phi::DenseTensor* X = context.Input<phi::DenseTensor>("X");
+    const phi::DenseTensor* Labels = context.Input<phi::DenseTensor>("Label");
+    const phi::DenseTensor* FgNum = context.Input<phi::DenseTensor>("FgNum");
+    phi::DenseTensor* Out = context.Output<phi::DenseTensor>("Out");
     T gamma = static_cast<T>(context.Attr<float>("gamma"));
     T alpha = static_cast<T>(context.Attr<float>("alpha"));
     auto out_data = Out->mutable_data<T>(context.GetPlace());
...
@@ -79,12 +77,13 @@ template <typename DeviceContext, typename T>
 class SigmoidFocalLossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* X = context.Input<phi::DenseTensor>("X");
-    const Tensor* Labels = context.Input<phi::DenseTensor>("Label");
-    const Tensor* FgNum = context.Input<phi::DenseTensor>("FgNum");
-    const Tensor* dOut =
+    const phi::DenseTensor* X = context.Input<phi::DenseTensor>("X");
+    const phi::DenseTensor* Labels = context.Input<phi::DenseTensor>("Label");
+    const phi::DenseTensor* FgNum = context.Input<phi::DenseTensor>("FgNum");
+    const phi::DenseTensor* dOut =
         context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-    Tensor* dX = context.Output<phi::DenseTensor>(framework::GradVarName("X"));
+    phi::DenseTensor* dX =
+        context.Output<phi::DenseTensor>(framework::GradVarName("X"));
     auto dx_data = dX->mutable_data<T>(context.GetPlace());
     T gamma = static_cast<T>(context.Attr<float>("gamma"));
     T alpha = static_cast<T>(context.Attr<float>("alpha"));
...
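For orientation only (not part of the diff): with p = sigmoid(x) and the gamma/alpha attributes read above, the per-element quantity these kernels compute is, up to the exact label handling and the normalization by the foreground count FgNum, the standard sigmoid focal loss:

\mathrm{FL}(p, y) \;=\; -\,\alpha\,(1-p)^{\gamma}\, y \,\log p \;-\; (1-\alpha)\, p^{\gamma}\,(1-y)\,\log(1-p)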
paddle/fluid/operators/detection/yolo_box_op_mlu.cc
...
@@ -77,7 +77,7 @@ class YoloBoxMLUKernel : public framework::OpKernel<T> {
     MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<T>());
     MLUOpTensorDesc img_size_desc(
         *img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<int32_t>());
-    Tensor anchors_temp(framework::TransToPhiDataType(VT::INT32));
+    phi::DenseTensor anchors_temp(framework::TransToPhiDataType(VT::INT32));
     anchors_temp.Resize({size});
     paddle::framework::TensorFromVector(
         anchors, ctx.device_context(), &anchors_temp);
...
paddle/fluid/operators/detection_map_op.cc
...
@@ -19,8 +19,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 class DetectionMAPOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
...
paddle/fluid/operators/dgc_clip_by_norm_op.h
...
@@ -21,8 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class DGCClipByNormKernel : public framework::OpKernel<T> {
  public:
...
paddle/fluid/operators/dropout_op_mlu.cc
...
@@ -18,8 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename T>
 class DropoutMLUKernel : public framework::OpKernel<T> {
  public:
...
@@ -106,8 +104,8 @@ class DropoutMLUKernel : public framework::OpKernel<T> {
     }

     // In downgrade_in_infer mode, need to multiply (1.0f - dropout_prob).
-    Tensor scale_tensor(x->dtype());
-    Tensor bias_tensor(x->dtype());
+    phi::DenseTensor scale_tensor(x->dtype());
+    phi::DenseTensor bias_tensor(x->dtype());
     scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
     bias_tensor.mutable_data<T>({1}, ctx.GetPlace());
     MLUCnnlTensorDesc scale_desc(scale_tensor);
...
@@ -157,7 +155,7 @@ class DropoutGradMLUKernel : public framework::OpKernel<T> {
     }

     // cast mask from uint8 to float32/float16
-    Tensor cast_mask(grad_x->dtype());
+    phi::DenseTensor cast_mask(grad_x->dtype());
     cast_mask.Resize(mask->dims());
     cast_mask.mutable_data<T>(ctx.GetPlace());
...
paddle/fluid/operators/dropout_op_npu.cc
...
@@ -23,8 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class DropoutNPUKernel : public framework::OpKernel<T> {
  public:
...
@@ -56,8 +54,8 @@ class DropoutNPUKernel : public framework::OpKernel<T> {
     // only achieve the default `upscale_in_train` method
     if (!is_test) {
-      Tensor tmp_x(x->dtype());
-      Tensor tmp_out(out->dtype());
+      phi::DenseTensor tmp_x(x->dtype());
+      phi::DenseTensor tmp_out(out->dtype());
       tmp_x.ShareDataWith(*x);
       tmp_out.ShareDataWith(*out);
       if (x->dims().size() == 1) {
...
@@ -80,7 +78,7 @@ class DropoutNPUKernel : public framework::OpKernel<T> {
         seed = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : 0;
       }

-      Tensor keep_prob_tensor(x->dtype());
+      phi::DenseTensor keep_prob_tensor(x->dtype());
       keep_prob_tensor.mutable_data<T>({1}, ctx.GetPlace());
       FillNpuTensorWithConstant<T>(&keep_prob_tensor,
                                    static_cast<T>(keep_prob));
...
@@ -89,14 +87,14 @@ class DropoutNPUKernel : public framework::OpKernel<T> {
       // mask used in `DropOutGenMask` NPU OP is different from
       // the output `Mask`.
-      Tensor npu_mask(experimental::DataType::UINT8);
+      phi::DenseTensor npu_mask(experimental::DataType::UINT8);
       uint32_t length = (x->numel() + 128 - 1) / 128 * 128;
       npu_mask.Resize(phi::make_ddim({length / 8}));
       npu_mask.mutable_data<uint8_t>(ctx.GetPlace());

       // TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU
       // OP must be a scalar with shape[0]. At present, the shape
-      // of the `prob` Tensor of this OP is forced to be set to 0
+      // of the `prob` phi::DenseTensor of this OP is forced to be set to 0
       // in `npu_op_runner.cc`, which needs to be optimized later.
       NpuOpRunner runner_gen_mask;
       runner_gen_mask.SetType("DropOutGenMask")
...
@@ -116,7 +114,7 @@ class DropoutNPUKernel : public framework::OpKernel<T> {
       runner_dropout.Run(stream);

       // cast `out` from float/float16 to bool
-      Tensor cast_mask(experimental::DataType::BOOL);
+      phi::DenseTensor cast_mask(experimental::DataType::BOOL);
       cast_mask.Resize(mask->dims());
       cast_mask.mutable_data<bool>(ctx.GetPlace());
       auto dst_dtype_bool =
...
@@ -176,7 +174,7 @@ class DropoutGradNPUKernel : public framework::OpKernel<T> {
     }

     // cast mask from uint8 to float32/float16
-    Tensor cast_mask(dx->dtype());
+    phi::DenseTensor cast_mask(dx->dtype());
     cast_mask.Resize(mask->dims());
     cast_mask.mutable_data<T>(ctx.GetPlace());
     auto dst_dtype =
...
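The npu_mask sizing in the hunk above rounds the element count up to a multiple of 128 bits and stores the mask as packed bytes (length / 8). A small worked example of that arithmetic, using the same rounding rule as the kernel (the numbers are illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t numel = 1000;                            // example element count
  uint32_t length = (numel + 128 - 1) / 128 * 128;  // rounded up -> 1024 bits
  uint32_t bytes = length / 8;                      // -> 128 bytes for npu_mask
  std::printf("bits=%u bytes=%u\n", length, bytes);
  return 0;
}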
paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc
...
@@ -16,7 +16,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
 template <typename T>
 class ElementwiseAddMLUKernel : public framework::OpKernel<T> {
...
paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
...
@@ -21,7 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
 template <typename T>
 class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
...
@@ -53,7 +52,7 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
       const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
       runner.Run(dev_ctx.stream());
     } else {
-      Tensor transformed_x, transformed_y;
+      phi::DenseTensor transformed_x, transformed_y;
       NpuElementWiseOpBroadcast<T>(
           dev_ctx, x, y, axis, &transformed_x, &transformed_y);
       const auto& runner =
...
@@ -96,7 +95,7 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
         }
       }
       if (!reduce_axes.empty()) {
-        Tensor tmp;
+        phi::DenseTensor tmp;
         tmp.ShareDataWith(*dx);
         tmp.Resize(phi::make_ddim(dst_dims_vec));
         const auto& runner =
...
@@ -128,7 +127,7 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
         }
       }
       if (!reduce_axes.empty()) {
-        Tensor tmp;
+        phi::DenseTensor tmp;
         tmp.ShareDataWith(*dy);
         tmp.Resize(phi::make_ddim(dst_dims_vec));
         const auto& runner =
...
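The reduce_axes handling in the gradient kernel above reflects the usual rule for a broadcast add: the incoming gradient is summed over the axes that were broadcast so that each input gradient regains its own shape. Schematically, for out = x + y with broadcasting:

\frac{\partial L}{\partial x} = \sum_{\text{axes broadcast in } x} \frac{\partial L}{\partial\, \mathrm{out}},\qquad
\frac{\partial L}{\partial y} = \sum_{\text{axes broadcast in } y} \frac{\partial L}{\partial\, \mathrm{out}}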
paddle/fluid/operators/elementwise/elementwise_div_op.h
...
@@ -24,7 +24,6 @@ namespace operators {
 class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  using Tensor = phi::DenseTensor;

   void InferShape(framework::InferShapeContext* ctx) const override {
     auto y_grad_name = framework::GradVarName("Y");
...
paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc
...
@@ -21,8 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename T>
 class ElementwiseDivMLUKernel : public framework::OpKernel<T> {
  public:
...
@@ -66,7 +64,7 @@ class ElementwiseDivGradMLUKernel : public framework::OpKernel<T> {
         CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);

     // compute dout/y == 1/y * dout
-    Tensor dout_div_y(dout->dtype());
+    phi::DenseTensor dout_div_y(dout->dtype());
     dout_div_y.Resize(dout->dims());
     dout_div_y.mutable_data<T>(ctx.GetPlace());
     MLUBinary<DIV>(ctx,
...
@@ -110,7 +108,7 @@ class ElementwiseDivGradMLUKernel : public framework::OpKernel<T> {
     if (dy) {
       // compute dy = -out * (dout/y) = -out/y * dout
-      Tensor neg_out(out->type());
+      phi::DenseTensor neg_out(out->type());
       neg_out.mutable_data<T>(out->dims(), ctx.GetPlace());

       MLUCnnlTensorDesc out_desc(*out);
...
@@ -121,7 +119,7 @@ class ElementwiseDivGradMLUKernel : public framework::OpKernel<T> {
                 out_desc.get(),
                 GetBasePtr(&neg_out));

-      Tensor dy_temp(y->dtype());
+      phi::DenseTensor dy_temp(y->dtype());
       dy_temp.Resize(dout->dims());
       dy_temp.mutable_data<T>(ctx.GetPlace());
...
paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc
...
@@ -21,8 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class ElementwiseDivNPUKernel : public framework::OpKernel<T> {
  public:
...
@@ -66,38 +64,38 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> {
     if (dx) {
       dx->mutable_data<T>(place);

-      Tensor tensor_one(y->type());
+      phi::DenseTensor tensor_one(y->type());
       tensor_one.mutable_data<float>({1}, place);
       FillNpuTensorWithConstant<float>(&tensor_one, static_cast<float>(1.0));

       // Use `Div` CANN OP to achieve `1/y` instead of `Power` CANN OP.
       // Because `Power` will cause precision overflow, that is, `float_status`
       // will be set to 1.
-      Tensor y_div(y->type());
+      phi::DenseTensor y_div(y->type());
       y_div.mutable_data<T>(y->dims(), place);
       const auto& runner_one_div_y =
           NpuOpRunner("Div", {tensor_one, *y}, {y_div}, {});
       runner_one_div_y.Run(stream);

-      Tensor tensor_zeros(x->type());
+      phi::DenseTensor tensor_zeros(x->type());
       tensor_zeros.mutable_data<T>(x->dims(), place);
       const auto& runner_tensor_zeros =
           NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {});
       runner_tensor_zeros.Run(stream);

-      Tensor x_zero(experimental::DataType::BOOL);
+      phi::DenseTensor x_zero(experimental::DataType::BOOL);
       x_zero.mutable_data<bool>(x->dims(), place);
       const auto& runner_x_zero =
           NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {});
       runner_x_zero.Run(stream);

-      Tensor x_nozero(experimental::DataType::BOOL);
+      phi::DenseTensor x_nozero(experimental::DataType::BOOL);
       x_nozero.mutable_data<bool>(x->dims(), place);
       const auto& runner_x_nonzero =
           NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {});
       runner_x_nonzero.Run(stream);

-      Tensor x_nozero_f(x->type());
+      phi::DenseTensor x_nozero_f(x->type());
       x_nozero_f.mutable_data<T>(x->dims(), place);
       const auto& runner_x_nonzero_f =
           NpuOpRunner("Cast",
...
@@ -106,7 +104,7 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> {
                       {{"dst_type", static_cast<int32_t>(0)}});
       runner_x_nonzero_f.Run(stream);

-      Tensor x_grad_w(x->type());
+      phi::DenseTensor x_grad_w(x->type());
       x_grad_w.mutable_data<T>(x->dims(), place);
       const auto& runner_x_grad_w =
           NpuOpRunner("Mul", {x_nozero_f, y_div}, {x_grad_w}, {});
...
@@ -120,19 +118,19 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> {
     if (dy) {
       dy->mutable_data<T>(place);

-      Tensor neg_out(out->type());
+      phi::DenseTensor neg_out(out->type());
       neg_out.mutable_data<T>(out->dims(), place);
       const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {});
       runner_neg_out.Run(stream);

-      Tensor tmp_mul(out->type());
+      phi::DenseTensor tmp_mul(out->type());
       tmp_mul.mutable_data<T>(out->dims(), place);
       const auto& runner_mul =
           NpuOpRunner("Mul", {neg_out, *dout}, {tmp_mul}, {});
       runner_mul.Run(stream);

       if (dy->dims() != dout->dims()) {
-        Tensor reduced_tmp_mul(y->type());
+        phi::DenseTensor reduced_tmp_mul(y->type());
         reduced_tmp_mul.mutable_data<T>(y->dims(), place);
         std::vector<int64_t> axes;
...
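The comments in these division-grad kernels ("compute dout/y == 1/y * dout", "dy = -out * (dout/y) = -out/y * dout") follow directly from differentiating out = x / y:

\frac{\partial\, \mathrm{out}}{\partial x} = \frac{1}{y},\qquad
\frac{\partial\, \mathrm{out}}{\partial y} = -\frac{x}{y^{2}} = -\frac{\mathrm{out}}{y}
\;\Longrightarrow\;
dx = \frac{dout}{y},\qquad dy = -\,\mathrm{out}\cdot\frac{dout}{y}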
paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc
...
@@ -21,8 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename T>
 class ElementwiseFloorDivNPUKernel : public framework::OpKernel<T> {
  public:
...
paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc
...
@@ -18,8 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class ElementwiseMaxNPUKernel : public framework::OpKernel<T> {
  public:
...
@@ -51,7 +49,7 @@ class ElementwiseMaxNPUKernel : public framework::OpKernel<T> {
       const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {});
       runner.Run(stream);
     } else {
-      Tensor transformed_x, transformed_y;
+      phi::DenseTensor transformed_x, transformed_y;
       NpuElementWiseOpBroadcast<T>(
           dev_ctx, x, y, axis, &transformed_x, &transformed_y);
       const auto& runner =
...
@@ -85,7 +83,7 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel<T> {
     auto x_dims = x->dims();
     auto y_dims = y->dims();
     axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
-    Tensor transformed_x, transformed_y;
+    phi::DenseTensor transformed_x, transformed_y;
     NpuElementWiseOpBroadcast<T>(
         dev_ctx, x, y, axis, &transformed_x, &transformed_y);
...
@@ -99,9 +97,9 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel<T> {
     if (dx && dy) {
       dx->mutable_data<T>(ctx.GetPlace());
       dy->mutable_data<T>(ctx.GetPlace());
-      Tensor tmp_dx;
+      phi::DenseTensor tmp_dx;
       tmp_dx.mutable_data<T>(dout_dims, ctx.GetPlace());
-      Tensor tmp_dy;
+      phi::DenseTensor tmp_dy;
       tmp_dy.mutable_data<T>(dout_dims, ctx.GetPlace());

       const auto& runner = NpuOpRunner("MaximumGrad",
...
@@ -153,12 +151,12 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel<T> {
       }
     } else if (dx) {
-      Tensor zero_tensor(dout->type());
+      phi::DenseTensor zero_tensor(dout->type());
       zero_tensor.mutable_data<T>(dout_dims, ctx.GetPlace());
       FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));

       dx->mutable_data<T>(ctx.GetPlace());
-      Tensor tmp_dx;
+      phi::DenseTensor tmp_dx;
       tmp_dx.mutable_data<T>(dout_dims, ctx.GetPlace());

       const auto& runner = NpuOpRunner("MaximumGrad",
...
@@ -190,12 +188,12 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel<T> {
       }
     } else if (dy) {
-      Tensor zero_tensor(dout->type());
+      phi::DenseTensor zero_tensor(dout->type());
       zero_tensor.mutable_data<T>(dout_dims, ctx.GetPlace());
       FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));

       dy->mutable_data<T>(ctx.GetPlace());
-      Tensor tmp_dy;
+      phi::DenseTensor tmp_dy;
       tmp_dy.mutable_data<T>(dout_dims, ctx.GetPlace());

       const auto& runner = NpuOpRunner("MaximumGrad",
...
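The MaximumGrad calls above route the upstream gradient to whichever input attains the maximum; when only one of dx or dy is requested, the zero tensor simply stands in for the discarded branch. Elementwise (tie-breaking at x = y depends on the CANN op, assumed here to favor x):

\frac{\partial \max(x, y)}{\partial x} = \mathbf{1}[x \ge y],\qquad
\frac{\partial \max(x, y)}{\partial y} = \mathbf{1}[x < y]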
paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_mlu.h
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_mul_op.h
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_npu.h
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_op.h
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/expand_as_op.h
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/expand_as_v2_op.h
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/expand_as_v2_op_mlu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/expand_op.h
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/expand_v2_op_npu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/eye_op_npu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/fc_op.h
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc
浏览文件 @
65420271
此差异已折叠。
点击以展开。
paddle/fluid/operators/fill_constant_op_mlu.cc
paddle/fluid/operators/filter_by_instag_op.cu
paddle/fluid/operators/filter_by_instag_op.h
paddle/fluid/operators/flatten_op.cc
paddle/fluid/operators/flatten_op_npu.cc
paddle/fluid/operators/fsp_op.h
paddle/fluid/operators/fused/attn_gemm.h
paddle/fluid/operators/fused/attn_gemm_int8.h
paddle/fluid/operators/fused/conv_fusion_op.cu
paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h
paddle/fluid/operators/fused/fmha_ref.h
paddle/fluid/operators/fused/fused_attention_op.cc
paddle/fluid/operators/fused/fused_attention_op.cu
paddle/fluid/operators/fused/fused_attention_op_xpu.cc
paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc
paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu
paddle/fluid/operators/fused/fused_bn_activation_op.cc
paddle/fluid/operators/fused/fused_bn_activation_op.cu
paddle/fluid/operators/fused/fused_bn_activation_op.h
paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
paddle/fluid/operators/fused/fused_bn_add_activation_op.h
paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
paddle/fluid/operators/fused/fused_feedforward_op.cc
paddle/fluid/operators/fused/fused_feedforward_op.cu
paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc
paddle/fluid/operators/fused/fused_gate_attention.h
paddle/fluid/operators/fused/fused_gate_attention_op.cc
paddle/fluid/operators/fused/fused_gate_attention_op.cu
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
paddle/fluid/operators/fused/fused_multi_transformer_op.cc
paddle/fluid/operators/fused/fused_multi_transformer_op.cu
paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
paddle/fluid/operators/fused/fusion_gru_op.cc
paddle/fluid/operators/fused/fusion_gru_op.h
paddle/fluid/operators/fused/fusion_lstm_op.cc
paddle/fluid/operators/fused/fusion_lstm_op.h
paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h
paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h
paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h
paddle/fluid/operators/fused/fusion_seqpool_concat_op.h
paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h
paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h
paddle/fluid/operators/fused/multihead_matmul_op.cu
paddle/fluid/operators/fused/resnet_basic_block_op.cc
paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc
paddle/fluid/operators/fused/resnet_unit_op.cc
paddle/fluid/operators/fused/resnet_unit_op.cu
paddle/fluid/operators/fused/resnet_unit_op_xpu.cc
paddle/fluid/operators/fused/skip_layernorm_op.cu
paddle/fluid/operators/fused/xpu_fused_common_function.h
paddle/fluid/operators/fused/yolo_box_head_op.cu
paddle/fluid/operators/fused/yolo_box_post_op.cu
paddle/fluid/operators/gather_nd_op_mlu.cc
paddle/fluid/operators/gather_nd_op_npu.cc
paddle/fluid/operators/gather_scatter_kernel.cc
paddle/fluid/operators/gather_scatter_kernel.cu
paddle/fluid/operators/gather_scatter_kernel.h
paddle/fluid/operators/gaussian_random_op.cc
paddle/fluid/operators/gaussian_random_op_mlu.cc
paddle/fluid/operators/gaussian_random_op_npu.cc
paddle/fluid/operators/gelu_op_npu.cc
paddle/fluid/operators/graph_khop_sampler_op.cu
paddle/fluid/operators/graph_khop_sampler_op.h
paddle/fluid/operators/grid_sampler_op_mlu.cc
paddle/fluid/operators/group_norm_op.cc
paddle/fluid/operators/group_norm_op.cu
paddle/fluid/operators/group_norm_op.h
paddle/fluid/operators/group_norm_op_npu.cc
paddle/fluid/operators/gru_op.cc
paddle/fluid/operators/gru_op.cu.cc
paddle/fluid/operators/gru_op.h
paddle/fluid/operators/gru_unit_op.h
paddle/fluid/operators/huber_loss_op_mlu.cc
paddle/fluid/operators/huber_loss_op_npu.cc
paddle/fluid/operators/im2sequence_op.h
paddle/fluid/operators/index_sample_op_npu.cc
paddle/fluid/operators/index_select_op.h
paddle/fluid/operators/index_select_op_npu.cc
paddle/fluid/operators/inplace_abn_op.cc
paddle/fluid/operators/inplace_abn_op.cu
paddle/fluid/operators/inplace_abn_op.h
paddle/fluid/operators/instance_norm_op.cc
paddle/fluid/operators/instance_norm_op.h
paddle/fluid/operators/instance_norm_op_npu.cc
paddle/fluid/operators/interpolate_op.cu
paddle/fluid/operators/interpolate_op.h
paddle/fluid/operators/interpolate_op_npu.cc
paddle/fluid/operators/interpolate_v2_op_mlu.cc
paddle/fluid/operators/interpolate_v2_op_npu.cc
paddle/fluid/operators/jit/benchmark.cc
paddle/fluid/operators/kldiv_loss_op_npu.cc
paddle/fluid/operators/label_smooth_op_mlu.cc
paddle/fluid/operators/label_smooth_op_npu.cc
paddle/fluid/operators/layer_norm_kernel.cu.h
paddle/fluid/operators/layer_norm_op.cc
paddle/fluid/operators/layer_norm_op_mlu.cc
paddle/fluid/operators/layer_norm_op_npu.cc
paddle/fluid/operators/layout_utils.h
paddle/fluid/operators/limit_by_capacity_op.cu
paddle/fluid/operators/log_loss_op_npu.cc
paddle/fluid/operators/log_loss_op_xpu.cc
paddle/fluid/operators/lookup_table_dequant_op.h
paddle/fluid/operators/lookup_table_op.h
paddle/fluid/operators/lookup_table_v2_op.h
paddle/fluid/operators/lookup_table_v2_op_mlu.cc
paddle/fluid/operators/lookup_table_v2_op_npu.cc
paddle/fluid/operators/lrn_op.h
paddle/fluid/operators/lstm_op.h
paddle/fluid/operators/lstmp_op.h
paddle/fluid/operators/masked_select_op_mlu.cc
paddle/fluid/operators/match_matrix_tensor_op.cc
paddle/fluid/operators/match_matrix_tensor_op.h
paddle/fluid/operators/math/context_project.h
paddle/fluid/operators/math/eigen_values_vectors.h
paddle/fluid/operators/math/sample_prob.cu
paddle/fluid/operators/math/sample_prob.h
paddle/fluid/operators/math/sequence_pooling.cc
paddle/fluid/operators/math/softmax.cu
paddle/fluid/operators/math/tree2col.cu
paddle/fluid/operators/matmul_op_mlu.cc
paddle/fluid/operators/matmul_op_npu.cc
paddle/fluid/operators/matmul_v2_op_mlu.cc
paddle/fluid/operators/matmul_v2_op_npu.cc
paddle/fluid/operators/mean_iou_op.h
paddle/fluid/operators/mean_op_mlu.cc
paddle/fluid/operators/mean_op_npu.cc
paddle/fluid/operators/meshgrid_op_mlu.cc
paddle/fluid/operators/metrics/accuracy_op_mlu.cc
paddle/fluid/operators/metrics/accuracy_op_xpu.cc
paddle/fluid/operators/metrics/precision_recall_op.h
paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc
paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
paddle/fluid/operators/mlu/mlu_baseop.cc
paddle/fluid/operators/mlu/mlu_baseop.h
paddle/fluid/operators/modified_huber_loss_op.cu
paddle/fluid/operators/modified_huber_loss_op.h
paddle/fluid/operators/multi_dot_op.cc
paddle/fluid/operators/multinomial_op_npu.cc
paddle/fluid/operators/multiplex_op.cc
paddle/fluid/operators/nce_op.h
paddle/fluid/operators/norm_op_npu.cc
paddle/fluid/operators/norm_utils.cu.h
paddle/fluid/operators/number_count_op.cu
paddle/fluid/operators/one_hot_op.h
paddle/fluid/operators/one_hot_op_npu.cc
paddle/fluid/operators/one_hot_op_xpu.cc
paddle/fluid/operators/one_hot_v2_op_mlu.cc
paddle/fluid/operators/one_hot_v2_op_npu.cc
paddle/fluid/operators/optimizers/adadelta_op.cc
paddle/fluid/operators/optimizers/adagrad_op.cc
paddle/fluid/operators/optimizers/adam_op.h
paddle/fluid/operators/optimizers/adam_op_mlu.cc
paddle/fluid/operators/optimizers/adam_op_npu.cc
paddle/fluid/operators/optimizers/adamax_op.cc
paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
paddle/fluid/operators/optimizers/dpsgd_op.cc
paddle/fluid/operators/optimizers/ftrl_op.cc
paddle/fluid/operators/optimizers/ftrl_op.h
paddle/fluid/operators/optimizers/merged_adam_op.cc
paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc
paddle/fluid/operators/optimizers/momentum_op.cc
paddle/fluid/operators/optimizers/momentum_op_mlu.cc
paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
paddle/fluid/operators/optimizers/proximal_adagrad_op.h
paddle/fluid/operators/optimizers/proximal_gd_op.cc
paddle/fluid/operators/optimizers/proximal_gd_op.h
paddle/fluid/operators/optimizers/rmsprop_op_npu.cc
paddle/fluid/operators/optimizers/sparse_momentum_op.cc
paddle/fluid/operators/p_norm_op_npu.cc
paddle/fluid/operators/pad3d_op_npu.cc
paddle/fluid/operators/pad_op_npu.cc
paddle/fluid/operators/partial_concat_op.cc
paddle/fluid/operators/partial_concat_op.cu
paddle/fluid/operators/partial_concat_op.h
paddle/fluid/operators/partial_sum_op.cc
paddle/fluid/operators/partial_sum_op.cu
paddle/fluid/operators/partial_sum_op.h
paddle/fluid/operators/pool_op.cc
paddle/fluid/operators/pool_op.h
paddle/fluid/operators/pool_op_mlu.cc
paddle/fluid/operators/positive_negative_pair_op.h
paddle/fluid/operators/prelu_op.cc
paddle/fluid/operators/prroi_pool_op.cc
paddle/fluid/operators/prroi_pool_op.cu
paddle/fluid/operators/pyramid_hash_op.cc
paddle/fluid/operators/random_routing_op.cu
paddle/fluid/operators/rank_attention_op.cc
paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc
paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc
paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc
paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc
paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
paddle/fluid/operators/reduce_ops/reduce_op.h
paddle/fluid/operators/reduce_ops/reduce_op_function.h
paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
paddle/fluid/operators/reduce_ops/reduce_sum_op.h
paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc
paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
paddle/fluid/operators/reshape_op.cc
paddle/fluid/operators/rnn_op_mlu.cc
paddle/fluid/operators/roi_align_op.cc
paddle/fluid/operators/roi_align_op_mlu.cc
paddle/fluid/operators/roi_align_op_npu.cc
paddle/fluid/operators/roi_pool_op.cc
paddle/fluid/operators/sample_logits_op.cu
paddle/fluid/operators/sample_logits_op.h
paddle/fluid/operators/sampling_id_op.cc
paddle/fluid/operators/sampling_id_op.h
paddle/fluid/operators/save_combine_op.cc
paddle/fluid/operators/scatter_op_mlu.cc
paddle/fluid/operators/scatter_op_npu.cc
paddle/fluid/operators/search_compute.h
paddle/fluid/operators/seed_op.cc
paddle/fluid/operators/seed_op.h
paddle/fluid/operators/set_value_op.cc
paddle/fluid/operators/set_value_op.h
paddle/fluid/operators/set_value_op_mlu.cc
paddle/fluid/operators/set_value_op_npu.cc
paddle/fluid/operators/shape_op_mlu.cc
paddle/fluid/operators/shape_op_npu.cc
paddle/fluid/operators/shard_index_op_npu.cc
paddle/fluid/operators/shuffle_batch_op.h
paddle/fluid/operators/shuffle_channel_op.cu
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc
paddle/fluid/operators/similarity_focus_op.h
paddle/fluid/operators/slice_op.cc
paddle/fluid/operators/slice_op_mlu.cc
paddle/fluid/operators/slice_op_npu.cc
paddle/fluid/operators/smooth_l1_loss_op.h
paddle/fluid/operators/smooth_l1_loss_op_npu.cc
paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc
paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc
paddle/fluid/operators/space_to_depth_op.cc
paddle/fluid/operators/sparse_attention_op.cu
paddle/fluid/operators/split_op_mlu.cc
paddle/fluid/operators/split_op_npu.cc
paddle/fluid/operators/squared_l2_distance_op.h
paddle/fluid/operators/squared_l2_norm_op_mlu.cc
paddle/fluid/operators/squared_l2_norm_op_npu.cc
paddle/fluid/operators/stack_op_mlu.cc
paddle/fluid/operators/stack_op_npu.cc
paddle/fluid/operators/stft_op.h
paddle/fluid/operators/strided_slice_op.cc
paddle/fluid/operators/strided_slice_op_mlu.cc
paddle/fluid/operators/strided_slice_op_npu.cc
paddle/fluid/operators/sum_op_mlu.cc
paddle/fluid/operators/sum_op_npu.cc
paddle/fluid/operators/svd_helper.h
paddle/fluid/operators/sync_batch_norm_op_mlu.cc
paddle/fluid/operators/sync_batch_norm_op_npu.cc
paddle/fluid/operators/take_along_axis_op_npu.cc
paddle/fluid/operators/tdm_child_op.h
paddle/fluid/operators/tdm_sampler_op.h
paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
paddle/fluid/operators/teacher_student_sigmoid_loss_op.h
paddle/fluid/operators/temporal_shift_op.h
paddle/fluid/operators/tile_op_mlu.cc
paddle/fluid/operators/tile_op_npu.cc
paddle/fluid/operators/top_k_op.cu
paddle/fluid/operators/top_k_op.h
paddle/fluid/operators/top_k_op_npu.cc
paddle/fluid/operators/top_k_op_xpu.cc
paddle/fluid/operators/tree_conv_op.h
paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
paddle/fluid/operators/uniform_random_op.cc
paddle/fluid/operators/uniform_random_op.cu
paddle/fluid/operators/uniform_random_op.h
paddle/fluid/operators/uniform_random_op_mlu.cc
paddle/fluid/operators/uniform_random_op_npu.cc
paddle/fluid/operators/var_conv_2d_op.cc
paddle/fluid/operators/var_conv_2d_op.h
paddle/fluid/operators/where_index_op_mlu.cc
paddle/fluid/operators/where_index_op_npu.cc